Exemple #1
0
def pdb_contacts(pdb, chain, dist):
    """Write the sequence and the residue contact list of one PDB chain.

    The chain returned by ``pdb_getchain`` is rebuilt as one polypeptide
    with ``CaPPBuilder`` (``aa_only=False``). Its sequence is written to the
    module-level ``file_seq`` handle. The module-level ``file_map`` handle
    receives one line with the number of residues, followed by one
    zero-based ``"i j"`` line for every ordered residue pair whose
    representative atoms are closer than ``dist``.

    The representative atom of a residue is its CB when present, otherwise
    its CA.

    Args:
        pdb: Structure identifier forwarded to ``pdb_getchain``.
        chain: Chain selector forwarded to ``pdb_getchain``.
        dist: Contact cut-off compared against ``norm(coord1, coord2)``
            (presumably a Euclidean distance -- confirm against ``norm``).

    Raises:
        IndexError: If no polypeptide can be built from the chain.
        Exception: Any error raised while looking up the representative
            atom is re-raised after an "ERROR" marker has been written
            (to stdout for the outer residue, to ``file_map`` for the
            inner one). Grep the output if running unsupervised.
    """
    pdb_chain = pdb_getchain(pdb, chain)
    ppb = CaPPBuilder()
    # If a mutated residue is present in a chain it is classed as a hetatm.
    # However, not all hetatms in a chain are part of the sequence. The
    # CaPPBuilder makes sequences by requiring CA-CA distances to be <4.3A.
    # Common hetatms are identified such that an MSE hetatm will be replaced
    # by an M in the sequence.
    # Build the fragments once and reuse the list; building them is not free.
    fragments = ppb.build_peptides(pdb_chain, aa_only=False)
    polypepTot = fragments[0]
    sequen = polypepTot.get_sequence()

    # Append every further fragment to the first one.
    for polypep_raw in fragments[1:]:
        sequen += polypep_raw.get_sequence()
        polypepTot += polypep_raw

    # Sometimes the terminal residue in a protein isn't fully resolved.
    last_res = polypepTot[-1]
    if last_res.has_id("CA") or last_res.has_id("CB"):
        polypep = polypepTot  # If resolved take whole AA
    else:
        polypep = polypepTot[:-1]  # Otherwise take all but the last AA
        sequen = sequen[:-1]

    file_seq.write(">sequence\n%s\n" % sequen)
    file_seq.write("%s" % sequen)

    file_map.write(str(len(polypep)) + "\n")

    for i, residue1 in enumerate(polypep):
        # Quite frequently residues do not have resolved CB, in which case
        # use CA. If no CA exists either, flag it and propagate the error.
        try:
            atom1 = residue1["CB"] if residue1.has_id("CB") else residue1["CA"]
        except Exception:
            sys.stdout.write("ERROR")
            raise
        for j, residue2 in enumerate(polypep):
            try:
                atom2 = (residue2["CB"] if residue2.has_id("CB")
                         else residue2["CA"])
            except Exception:
                file_map.write("ERROR")
                raise
            if norm(atom1.get_coord(), atom2.get_coord()) < dist:
                file_map.write("%d %d\n" % (i, j))
 def test_insertions(self):
     """Build polypeptides from 4ZHL, a file with residue insertion codes."""
     # Expected sequence of the first polypeptide segment.
     expected_sequence = (
         "IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATHCFIDYPKKEDYIVYLGR"
         "SRLNSNTQGEMKFEVENLILHKDYSADTLAYHNDIALLKIRSKEGRCAQPSRTIQTIALPSMY"
         "NDPQFGTSCEITGFGKEQSTDYLYPEQLKMTVVKLISHRECQQPHYYGSEVTTKMLCAADPQW"
         "KTDSCQGDSGGPLVCSLQGRMTLTGIVSWGRGCALKDKPGVYTRVSHFLPWIRSHTKE"
     )
     cif_parser = MMCIFParser(QUIET=1)
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", PDBConstructionWarning)
         structure = cif_parser.get_structure("example", "PDB/4ZHL.cif")
     for builder in (PPBuilder(), CaPPBuilder()):
         # Non-standard amino acids are allowed (aa_only=False).
         segments = builder.build_peptides(structure[0], False)
         self.assertEqual(len(segments), 2)
         first_segment = segments[0]
         # Residue numbers at both ends of the first segment only.
         self.assertEqual(first_segment[0].get_id()[1], 16)
         self.assertEqual(first_segment[-1].get_id()[1], 244)
         # Type, alphabet and content of the sequence.
         sequence = first_segment.get_sequence()
         self.assertIsInstance(sequence, Seq)
         self.assertEqual(sequence.alphabet, generic_protein)
         self.assertEqual(expected_sequence, str(sequence))
Exemple #3
0
def pdb_polypep(pdb, chain, trim):
    """Return one chain of a PDB structure as a single polypeptide.

    The chain returned by ``pdb_getchain`` is split into fragments by
    ``CaPPBuilder`` (``aa_only=False``) and the fragments are joined in
    order into one polypeptide.

    Args:
        pdb: Structure identifier forwarded to ``pdb_getchain``.
        chain: Chain selector forwarded to ``pdb_getchain``.
        trim: When truthy, the joined polypeptide is passed through
            ``pp_trim`` before the terminal-residue check (the original
            comment says this removes unstructured terminal ends).

    Returns:
        The joined polypeptide, without its last residue when that residue
        has neither a CA nor a CB atom.

    Raises:
        IndexError: If no polypeptide can be built from the chain.
    """
    pdb_chain = pdb_getchain(pdb, chain)
    ppb = CaPPBuilder()
    # If a mutated residue is present in a chain it is classed as a hetatm.
    # However, not all hetatms in a chain are part of the sequence. The
    # CaPPBuilder makes sequences by requiring CA-CA distances to be <4.3A.
    # Build the fragments once and reuse the list; building them is not free.
    fragments = ppb.build_peptides(pdb_chain, aa_only=False)
    polypepTot = fragments[0]
    for polypep_raw in fragments[1:]:
        polypepTot += polypep_raw

    # Remove unstructured terminal ends
    if trim:
        polypepTot = pp_trim(polypepTot)

    # Sometimes the terminal residue in a protein isn't fully resolved
    last_res = polypepTot[-1]
    if last_res.has_id("CA") or last_res.has_id("CB"):
        return polypepTot  # If resolved take whole AA
    return polypepTot[:-1]  # Otherwise take all but the last AA
Exemple #4
0
def read_pdb_file(file_name, name=None):
    """Extract structure, residues, sequence and atoms from a PDB file.

    Polypeptides are built with ``PPBuilder``; when that yields nothing,
    ``CaPPBuilder`` is used instead.

    Args:
        file_name: Path of the PDB file.
        name: Name given to the parsed structure. Defaults to ``file_name``
            without its extension.

    Returns:
        A tuple ``(protein_name, structure, residues, sequence, atoms)``:

        protein_name: Base name of ``file_name`` with every ``".pdb"``
            occurrence removed.
        structure: The parsed structure object.
        residues: Residues of all polypeptides, flattened in polypeptide
            order.
        sequence: Concatenated sequence string of all polypeptides, in the
            same order as ``residues`` (so ``sequence[i]`` should describe
            ``residues[i]``, assuming one letter per residue).
        atoms: All atoms of the structure.

    Raises:
        ValueError: If the parsed structure does not have exactly one
            child entity (``len(structure) != 1``).
    """
    if name is None:
        name = splitext(file_name)[0]

    structure = PDBParser().get_structure(name, file_name)

    if len(structure) != 1:
        raise ValueError("Unexpected number of structures in " + name)

    atoms = Selection.unfold_entities(structure, 'A')
    polypeptides = PPBuilder().build_peptides(structure)
    if not polypeptides:
        # Fall back to the CA-CA based builder when the default one finds
        # no polypeptide at all.
        polypeptides = CaPPBuilder().build_peptides(structure)
    sequence = ''.join(str(p.get_sequence()) for p in polypeptides)
    residues = [
        residue for polypeptide in polypeptides for residue in polypeptide
    ]
    protein_name = os.path.basename(file_name).replace(".pdb", "")
    return protein_name, structure, residues, sequence, atoms
Exemple #5
0
 def test_parser(self):
     """Extract polypeptides from 1A8O."""
     # (first residue number, last residue number, sequence) of each
     # fragment expected when only standard amino acids are accepted.
     strict_fragments = (
         (152, 184, "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW"),
         (186, 213, "TETLLVQNANPDCKTILKALGPGATLEE"),
         (216, 220, "TACQG"),
     )
     parser = MMCIFParser()
     structure = parser.get_structure("example", "PDB/1A8O.cif")
     self.assertEqual(len(structure), 1)
     for ppbuild in [PPBuilder(), CaPPBuilder()]:
         # ==========================================================
         # Check that serial_num (model column) is stored properly
         self.assertEqual(structure[0].serial_num, 1)
         # First try allowing non-standard amino acids,
         polypeptides = ppbuild.build_peptides(structure[0], False)
         self.assertEqual(len(polypeptides), 1)
         pp = polypeptides[0]
         # Check the start and end positions
         self.assertEqual(pp[0].get_id()[1], 151)
         self.assertEqual(pp[-1].get_id()[1], 220)
         # Check the sequence
         s = pp.get_sequence()
         self.assertIsInstance(s, Seq)
         self.assertEqual(s.alphabet, generic_protein)
         # Here non-standard MSE are shown as M
         self.assertEqual(
             "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
             "NANPDCKTILKALGPGATLEEMMTACQG", str(s))
         # ==========================================================
         # Now try strict version with only standard amino acids
         # Should ignore MSE 151 at start, and then break the chain
         # at MSE 185, and MSE 214,215
         polypeptides = ppbuild.build_peptides(structure[0], True)
         self.assertEqual(len(polypeptides), 3)
         # The length was asserted above, so zip() cannot hide a fragment.
         for pp, (start, end, sequence) in zip(polypeptides,
                                               strict_fragments):
             self.assertEqual(pp[0].get_id()[1], start)
             self.assertEqual(pp[-1].get_id()[1], end)
             s = pp.get_sequence()
             self.assertIsInstance(s, Seq)
             self.assertEqual(s.alphabet, generic_protein)
             self.assertEqual(sequence, str(s))
    def testModels(self):
        """Check model handling on the multi-model files 1LCD and 2OFG."""
        expected_sequence = "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR"
        cif_parser = MMCIFParser(QUIET=1)
        fast_cif_parser = FastMMCIFParser(QUIET=1)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            structure = cif_parser.get_structure("example", "PDB/1LCD.cif")
            f_structure = fast_cif_parser.get_structure(
                "example", "PDB/1LCD.cif"
            )

        self.assertEqual(len(structure), 3)
        self.assertEqual(len(f_structure), 3)

        for builder in (PPBuilder(), CaPPBuilder()):
            # serial_num (model column) must be stored for every model.
            for model_index, serial in enumerate((1, 2, 3)):
                self.assertEqual(structure[model_index].serial_num, serial)
            # First allow non-standard amino acids (where MSE would show up
            # as M), then repeat with standard amino acids only; both runs
            # are expected to give the same single polypeptide.
            for aa_only in (False, True):
                peptides = builder.build_peptides(structure[0], aa_only)
                self.assertEqual(len(peptides), 1)
                peptide = peptides[0]
                # Start and end residue numbers
                self.assertEqual(peptide[0].get_id()[1], 1)
                self.assertEqual(peptide[-1].get_id()[1], 51)
                # Sequence type, alphabet and content
                sequence = peptide.get_sequence()
                self.assertIsInstance(sequence, Seq)
                self.assertEqual(sequence.alphabet, generic_protein)
                self.assertEqual(expected_sequence, str(sequence))

        # This structure contains several models with multiple lengths.
        # The tests were failing.
        structure = cif_parser.get_structure("example", "PDB/2OFG.cif")
        self.assertEqual(len(structure), 3)
Exemple #7
0
 def test_polypeptide(self):
     """Tests on polypeptide class and methods."""
     # Sequences of the three fragments, identical for both builders.
     expected_sequences = (
         "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
         "TETLLVQNANPDCKTILKALGPGATLEE",
         "TACQG",
     )
     # Expected (phi, psi) of the first three residues of each fragment;
     # the first residue of a fragment has no phi angle.
     expected_phi_psi = (
         (
             (None, -0.46297171497725553),
             (-1.0873937604007962, 2.1337707832637109),
             (-2.4052232743651878, 2.3807316946081554),
         ),
         (
             (None, -0.6810077089092923),
             (-1.2654003477656888, -0.58689987042756309),
             (-1.7467679151684763, -1.5655066256698336),
         ),
         (
             (None, -0.73222884210889716),
             (-1.1044740234566259, -0.69681334592782884),
             (-1.8497413300164958, 0.34762889834809058),
         ),
     )
     p = PDBParser(PERMISSIVE=True)
     pdb1 = "PDB/1A8O.pdb"
     s = p.get_structure("scr", pdb1)

     # C-N based builder
     ppb = PPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     for index, expected in enumerate(expected_phi_psi):
         phi_psi = pp[index].get_phi_psi_list()
         for position, (phi, psi) in enumerate(expected):
             if phi is None:
                 self.assertIsNone(phi_psi[position][0])
             else:
                 self.assertAlmostEqual(phi_psi[position][0], phi, places=3)
             self.assertAlmostEqual(phi_psi[position][1], psi, places=3)

     # CA-CA based builder
     ppb = CaPPBuilder()
     pp = ppb.build_peptides(s)
     for index, sequence in enumerate(expected_sequences):
         self.assertEqual(str(pp[index].get_sequence()), sequence)
     self.assertEqual([ca.serial_number for ca in pp[0].get_ca_list()], [
         10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93, 104, 112, 124, 131,
         139, 150, 161, 173, 182, 189, 197, 208, 213, 222, 231, 236, 242,
         251, 260, 267, 276, 284
     ])
     taus = pp[1].get_tau_list()
     self.assertAlmostEqual(taus[0], 0.3597907225123525, places=3)
     self.assertAlmostEqual(taus[1], 0.43239284636769254, places=3)
     self.assertAlmostEqual(taus[2], 0.99820157492712114, places=3)
     thetas = pp[2].get_theta_list()
     self.assertAlmostEqual(thetas[0], 1.6610069445335354, places=3)
     self.assertAlmostEqual(thetas[1], 1.7491703334817772, places=3)
     self.assertAlmostEqual(thetas[2], 2.0702447422720143, places=3)
Exemple #8
0
    def test_cappbuilder_tau(self):
        """Check tau and theta angles of polypeptides built by CaPPBuilder."""
        expected_taus = (
            0.3597907225123525,
            0.43239284636769254,
            0.99820157492712114,
        )
        expected_thetas = (
            1.6610069445335354,
            1.7491703334817772,
            2.0702447422720143,
        )
        peptides = CaPPBuilder().build_peptides(self.structure)

        # Tau angles are taken from the second polypeptide ...
        tau_list = peptides[1].get_tau_list()
        for index, expected in enumerate(expected_taus):
            self.assertAlmostEqual(tau_list[index], expected, places=3)
        # ... and theta angles from the third one.
        theta_list = peptides[2].get_theta_list()
        for index, expected in enumerate(expected_thetas):
            self.assertAlmostEqual(theta_list[index], expected, places=3)
Exemple #9
0
    def test_cappbuilder_real(self):
        """Check CaPPBuilder sequences and CA serial numbers on a real file."""
        expected_sequences = (
            "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
            "TETLLVQNANPDCKTILKALGPGATLEE",
            "TACQG",
        )
        expected_ca_serials = [
            10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93,
            104, 112, 124, 131, 139, 150, 161, 173, 182, 189, 197,
            208, 213, 222, 231, 236, 242, 251, 260, 267, 276, 284,
        ]
        builder = CaPPBuilder()
        peptides = builder.build_peptides(self.structure)

        # Fetch all three sequences before asserting anything about them.
        sequences = [peptides[index].get_sequence() for index in range(3)]
        for sequence, expected in zip(sequences, expected_sequences):
            self.assertEqual(sequence, expected)

        ca_serials = [ca.serial_number for ca in peptides[0].get_ca_list()]
        self.assertEqual(ca_serials, expected_ca_serials)
Exemple #10
0
def run_test():
    """Print a summary of ``PDB/a_structure.pdb`` and its polypeptides.

    For every model, chain and residue a line with the number of children
    is printed, together with notes about point mutations and disordered
    atoms. Afterwards the polypeptides of model 1 are printed, built once
    with ``PPBuilder`` and once with ``CaPPBuilder``, and finally
    ``quick_neighbor_search_test`` is run.

    ``print`` is called with a single parenthesised argument throughout,
    which behaves the same as a Python 2 print statement and as the
    Python 3 print function.
    """
    from Bio.PDB import PDBParser, PPBuilder, CaPPBuilder

    # first make a PDB parser object
    p = PDBParser(PERMISSIVE=1)

    # get the structure, call it "example"
    structure = p.get_structure("example", "PDB/a_structure.pdb")

    # now loop over content and print some info
    for model in structure.get_list():
        model_id = model.get_id()
        print("Model %i contains %i chains." % (model_id, len(model)))
        for chain in model.get_list():
            chain_id = chain.get_id()
            print("\tChain '%s' contains %i residues."
                  % (chain_id, len(chain)))
            for residue in chain.get_list():
                hetfield, resseq, icode = residue.get_id()
                print("\t\tResidue ('%s', %i, '%s') contains %i atoms."
                      % (hetfield, resseq, icode, len(residue)))
                disorder = residue.is_disordered()
                # check if there is disorder due to a point mutation --- this is rare
                if disorder == 2:
                    print("\t\t\tThere is a point mutation present in the crystal at this position.")
                    words = ["\t\t\tResidues at this position are"]
                    words.extend(residue.disordered_get_id_list())
                    print(" ".join(words) + ".")
                # count the number of disordered atoms
                if disorder == 1:
                    disordered_count = sum(
                        1 for atom in residue.get_list()
                        if atom.is_disordered())
                    if disordered_count > 0:
                        print("\t\t\tThe residue contains %i disordered atoms."
                              % disordered_count)

    print("Polypeptides using C-N")
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("Polypeptides using CA-CA")
    ppb = CaPPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("NeighborSearch test")
    quick_neighbor_search_test()
Exemple #11
0
 def test_ca_ca(self):
     """Extract polypeptides using CA-CA."""
     ppbuild = CaPPBuilder()
     polypeptides = ppbuild.build_peptides(self.structure[1])
     self.assertEqual(len(polypeptides), 1)
     pp = polypeptides[0]
     # Check the start and end positions
     self.assertEqual(pp[0].get_id()[1], 2)
     self.assertEqual(pp[-1].get_id()[1], 86)
     # Check the sequence; assertIsInstance reports the offending object
     # on failure, unlike assertTrue(isinstance(...)).
     s = pp.get_sequence()
     self.assertIsInstance(s, Seq)
     self.assertEqual(s.alphabet, generic_protein)
     self.assertEqual("RCGSQGGGSTCPGLRCCSIWGWCGDSEPYCGRTCENKCWSGER"
                      "SDHRCGAAVGNPPCGQDRCCSVHGWCGGGNDYCSGGNCQYRC",
                      str(s))
Exemple #12
0
    def test_cappbuilder_real_nonstd(self):
        """Check CaPPBuilder on a real file with non-standard residues allowed."""
        # Non-standard MSE residues are expected to show up as M.
        expected_sequence = (
            "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG"
        )
        builder = CaPPBuilder()
        peptides = builder.build_peptides(self.structure, False)

        self.assertEqual(len(peptides), 1)
        peptide = peptides[0]

        # Residue numbers at both ends of the single polypeptide
        self.assertEqual(peptide[0].get_id()[1], 151)
        self.assertEqual(peptide[-1].get_id()[1], 220)

        # Sequence type and content
        sequence = peptide.get_sequence()
        self.assertIsInstance(sequence, Seq)
        self.assertEqual(expected_sequence, sequence)
Exemple #13
0
def pdb_sequence(pdb_file, id=None, method="order"):
    """Extract the per-chain protein sequences of a PDB file.

    Args:
        pdb_file: Path of the PDB file to parse.
        id: Identifier for the structure and prefix of the record ids.
            Defaults to ``util.make_id_from_file_name(pdb_file)``.
        method: ``"order"`` takes every residue of a chain in file order
            and converts it with ``three_to_one``. ``"distance"``
            concatenates the sequences of the polypeptides built by
            ``CaPPBuilder``; no per-residue spec is produced in that case.

    Returns:
        ``pdb_seqs(id=..., chains=..., chains_map=...)`` where ``chains`` is
        a list with one dict per chain (keys ``id_chain``, ``seq_rec``,
        ``seq_spec``) and ``chains_map`` maps each chain id to its dict.
        If a chain id occurs more than once, the last entry wins in the map.

    Raises:
        ValueError: If ``method`` is unknown and the structure has at least
            one chain.
    """
    from Bio.PDB import PDBParser, CaPPBuilder
    from Bio.PDB.Polypeptide import three_to_one
    if id is None:
        id = util.make_id_from_file_name(pdb_file)
    parser = PDBParser()
    structure = parser.get_structure(id, pdb_file)
    seq_chains = []
    for chain in structure.get_chains():
        id_chain = chain.get_id()
        if method == "distance":
            ppb = CaPPBuilder()
            seq = sum((pp.get_sequence() for pp in ppb.build_peptides(chain)),
                      Seq("", IUPAC.protein))
            seq_spec = None  #TODO: implement
        elif method == "order":
            letters = []
            seq_spec = []
            for res in chain.get_residues():
                letters.append(three_to_one(res.get_resname()))
                ## from Bio docs, res.get_full_id() returns: ("1abc", 0, "A", (" ", 10, "A"))
                fid = res.get_full_id()
                seq_spec.append(
                    pdb_seq_spec(chain=fid[-2].strip(),
                                 resn=res.get_resname(),
                                 resi=fid[-1][-2],
                                 ins=fid[-1][-1].strip()))

            seq = Seq("".join(letters), IUPAC.protein)
        else:
            raise ValueError("Unknown method: {}".format(method))

        seq_chains.append(
            dict(id_chain=id_chain,
                 seq_rec=SeqRecord(seq,
                                   id="{}_{}".format(id, id_chain),
                                   description=""),
                 seq_spec=seq_spec))

    # Build the lookup once, after the loop. Inside the loop it was rebuilt
    # for every chain and stayed unbound for a structure without chains.
    chains_map = {entry["id_chain"]: entry for entry in seq_chains}
    return pdb_seqs(id=id, chains=seq_chains, chains_map=chains_map)
Exemple #14
0
 def testModels(self):
     """Test file with multiple models."""
     expected_sequence = "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR"
     parser = MMCIFParser()
     structure = parser.get_structure("example", "PDB/1LCD.cif")
     self.assertEqual(len(structure), 3)
     for ppbuild in [PPBuilder(), CaPPBuilder()]:
         # ==========================================================
         # Check that serial_num (model column) is stored properly
         self.assertEqual(structure[0].serial_num, 1)
         self.assertEqual(structure[1].serial_num, 2)
         self.assertEqual(structure[2].serial_num, 3)
         # First try allowing non-standard amino acids (where MSE would
         # be shown as M), then the strict version with only standard
         # amino acids; both must give the same single polypeptide.
         for aa_only in (False, True):
             polypeptides = ppbuild.build_peptides(structure[0], aa_only)
             self.assertEqual(len(polypeptides), 1)
             pp = polypeptides[0]
             # Check the start and end positions
             self.assertEqual(pp[0].get_id()[1], 1)
             self.assertEqual(pp[-1].get_id()[1], 51)
             # Check the sequence
             s = pp.get_sequence()
             self.assertIsInstance(s, Seq)
             self.assertEqual(s.alphabet, generic_protein)
             self.assertEqual(expected_sequence, str(s))
Exemple #15
0
 def _remove_missing_res(self, record: SeqRecord, pdb: Path):
     """Replace residues of ``record`` that the structure lacks with gaps.

     The ungapped record sequence is aligned against the concatenated
     sequence of all polypeptides built from ``pdb`` by ``CaPPBuilder``
     (``aa_only=False``). Every stretch of the first alignment's path in
     which the structure-side coordinate does not advance is turned into
     '-' characters in ``record.seq``.

     Returns the same ``record`` object, with its ``seq`` replaced.
     """
     structure = PDBParser().get_structure(record.id, pdb)
     peptides = CaPPBuilder().build_peptides(structure, aa_only=False)
     resolved = ''.join([str(peptide.get_sequence()) for peptide in peptides])
     aligner = PairwiseAligner()
     steps = aligner.align(record.seq.ungap('-'), resolved)[0].path
     # Consecutive path points sharing the same second coordinate delimit a
     # range of the ungapped record sequence (first coordinate).
     unresolved = [
         (here[0], there[0])
         for here, there in zip(steps, steps[1:])
         if here[1] == there[1]
     ]
     masked = record.seq.tomutable()
     # Go through the ranges last-to-first: residues already turned into
     # '-' are skipped by the counter below, so masking a later range does
     # not shift the positions of an earlier one.
     for start, stop in reversed(unresolved):
         position = 0  # index within the not-yet-gapped residues
         for index, letter in enumerate(masked):
             if letter == '-':
                 continue
             if start <= position < stop:
                 masked[index] = '-'
             position += 1
     record.seq = masked.toseq()
     return record
    def test_parsers(self):
        """Extract polypeptides from 1A8O with both mmCIF parsers."""
        full_sequence = (
            "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
            "NANPDCKTILKALGPGATLEEMMTACQG"
        )
        # (first residue number, last residue number, sequence) of each
        # fragment expected when only standard amino acids are accepted.
        strict_fragments = (
            (152, 184, "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW"),
            (186, 213, "TETLLVQNANPDCKTILKALGPGATLEE"),
            (216, 220, "TACQG"),
        )

        parser = MMCIFParser()
        fast_parser = FastMMCIFParser()

        structure = parser.get_structure("example", "PDB/1A8O.cif")
        f_structure = fast_parser.get_structure("example", "PDB/1A8O.cif")

        self.assertEqual(len(structure), 1)
        self.assertEqual(len(f_structure), 1)

        for builder in (PPBuilder(), CaPPBuilder()):
            # serial_num (model column) must be stored by both parsers.
            self.assertEqual(structure[0].serial_num, 1)
            self.assertEqual(f_structure[0].serial_num, structure[0].serial_num)

            # Permissive run: non-standard amino acids are allowed.
            peptides = builder.build_peptides(structure[0], False)
            f_peptides = builder.build_peptides(f_structure[0], False)

            self.assertEqual(len(peptides), 1)
            self.assertEqual(len(f_peptides), 1)

            peptide = peptides[0]
            f_peptide = f_peptides[0]

            # Start and end residue numbers, for both parsers
            self.assertEqual(peptide[0].get_id()[1], 151)
            self.assertEqual(peptide[-1].get_id()[1], 220)

            self.assertEqual(f_peptide[0].get_id()[1], 151)
            self.assertEqual(f_peptide[-1].get_id()[1], 220)

            # Both parsers must agree on the sequence; the details are then
            # checked on one of them only.
            sequence = peptide.get_sequence()
            f_sequence = f_peptide.get_sequence()

            self.assertEqual(sequence, f_sequence)

            self.assertIsInstance(sequence, Seq)
            self.assertEqual(sequence.alphabet, generic_protein)

            # Here non-standard MSE are shown as M
            self.assertEqual(full_sequence, str(sequence))

            # Strict run: only standard amino acids. MSE 151 at the start
            # should be ignored and the chain should break at MSE 185 and
            # at MSE 214,215.
            peptides = builder.build_peptides(structure[0], True)
            self.assertEqual(len(peptides), 3)

            for index, (start, end, expected) in enumerate(strict_fragments):
                peptide = peptides[index]
                self.assertEqual(peptide[0].get_id()[1], start)
                self.assertEqual(peptide[-1].get_id()[1], end)
                sequence = peptide.get_sequence()
                self.assertIsInstance(sequence, Seq)
                self.assertEqual(sequence.alphabet, generic_protein)
                self.assertEqual(expected, str(sequence))

        s_atoms = list(structure.get_atoms())
        f_atoms = list(f_structure.get_atoms())

        # Atom-level checks, run on the output of each parser in turn.
        for atoms in (s_atoms, f_atoms):
            self.assertEqual(len(atoms), 644)
            atom_names = ["N", "CA", "C", "O", "CB"]
            leading = atoms[:5]
            self.assertSequenceEqual([a.get_name() for a in leading], atom_names)
            self.assertSequenceEqual([a.get_id() for a in leading], atom_names)
            self.assertSequenceEqual(
                [a.get_fullname() for a in leading], atom_names
            )
            self.assertSequenceEqual(
                [a.get_occupancy() for a in leading], [1.0, 1.0, 1.0, 1.0, 1.0]
            )
            first_atom = atoms[0]
            self.assertIsInstance(first_atom.get_coord(), numpy.ndarray)
            coord = numpy.array([19.594, 32.367, 28.012], dtype=numpy.float32)
            numpy.testing.assert_array_equal(first_atom.get_coord(), coord)

            self.assertEqual(first_atom.get_bfactor(), 18.03)
            for atom in atoms:
                self.assertIsNone(atom.get_anisou())