コード例 #1
0
    def test_align_file(self):
        """Combine the sequences of three PDB files and check the FASTA text.

        The Sequence objects are accumulated with ``+=`` and rendered with
        ``fasta_str(pdbname=True)``. The expected output lists 1D7M.pdb
        twice, followed by 1GU8.pdb and 2UUI.pdb, each record terminated by
        an empty line.
        """
        pdb_paths = [
            os.path.join(self.testfiles_dir, name)
            for name in ('1D7M.pdb', '1GU8.pdb', '2UUI.pdb')
        ]
        combined = sequence_util.Sequence(pdb=pdb_paths[0])
        for path in pdb_paths[1:]:
            combined += sequence_util.Sequence(pdb=path)

        # The same header and residues are expected for both 1D7M records
        record_1d7m = [
            ">1D7M.pdb",
            "EMANRLAGLENSLESEKVSREQLIKQKDQLNSLLASLESEGAEREKRLRELEAKLDETLKNLELEKLARMELEARLAKTE",
            "KDRAILELKLAEAIDEKSKLE",
            "",
        ]
        record_1gu8 = [
            ">1GU8.pdb",
            "VGLTTLFWLGAIGMLVGTLAFAWAGRDAGSGERRYYVTLVGISGIAAVAYVVMALGVGWVPVAERTVFAPRYIDWILTTP",
            "LIVYFLGLLAGLDSREFGIVITLNTVVMLAGFAGAMVPGIERYALFGMGAVAFLGLVYYLVGPMTESASQRSSGIKSLYV",
            "RLRNLTVILWAIYPFIWLLGPPGVALLTPTVDVALIVYLDLVTKVGFGFIALDAAATL",
            "",
        ]
        record_2uui = [
            ">2UUI.pdb",
            "MHHHHHHKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEFERVYRAQVNCSEYFPLFLATLWVAGIFF",
            "HEGAAALCGLVYLFARLRYFQGYARSAQLRLAPLYASARALWLLVALAALGLLAHFLPAALRAALLGRLRTLLPWA",
            "",
        ]
        expected_lines = record_1d7m + record_1d7m + record_1gu8 + record_2uui
        # Every line, including the empty separators, ends with os.linesep
        expected = "".join(line + os.linesep for line in expected_lines)

        self.assertEqual(combined.fasta_str(pdbname=True), expected)
コード例 #2
0
 def alignment_file(self, models, alignment_file=None):
     """Write a FASTA alignment file containing the sequences of all models.

     The models are assumed to be of equal length, although their residues
     may differ. When ``alignment_file`` is not supplied, 'homologs.fasta'
     inside ``self.work_dir`` is used.

     Returns the path of the file that was written; raises RuntimeError if
     the sequences are not all of the same length.
     """
     if not alignment_file:
         alignment_file = os.path.join(self.work_dir, 'homologs.fasta')

     # Accumulate every model's sequence into a single Sequence object
     combined = sequence_util.Sequence(pdb=models[0])
     for pdb in models[1:]:
         combined += sequence_util.Sequence(pdb=pdb)

     # Every sequence must be as long as the first one
     lengths = [len(seq) for seq in combined.sequences]
     if any(n != lengths[0] for n in lengths):
         raise RuntimeError('PDB files are not all of the same length!\n{0}'.format(models))

     combined.write_fasta(alignment_file, pdbname=True)
     return alignment_file
コード例 #3
0
    def test_add(self):
        """Adding a FASTA-derived Sequence to a PDB-derived one merges both.

        After ``+=`` every per-sequence list on the combined object is
        expected to hold two entries.
        """
        s1 = sequence_util.Sequence(
            pdb=os.path.join(self.testfiles_dir, '1GU8.pdb'))
        s2 = sequence_util.Sequence(
            fasta=os.path.join(self.testfiles_dir, '2uui.fasta'))
        s1 += s2

        # assertTrue(x, 2) treats 2 as the failure message and passes for any
        # non-empty list, so the lengths are compared with assertEqual.
        self.assertEqual(len(s1.sequences), 2)
        self.assertEqual(len(s1.resseqs), 2)
        self.assertEqual(len(s1.headers), 2)
        self.assertEqual(len(s1.pdbs), 2)
        self.assertEqual(len(s1.chains), 2)
        self.assertEqual(len(s1.fasta_files), 2)
コード例 #4
0
 def test__parse_fasta_2(self):
     """A space inside a sequence line must not appear in the parsed sequence."""
     lines = [">foo", "AAAAA AA"]
     seq = sequence_util.Sequence()
     seq._parse_fasta(lines)
     # The header is kept verbatim; the residues are expected without the space
     self.assertListEqual(seq.headers, [">foo"])
     self.assertListEqual(seq.sequences, ["AAAAAAA"])
コード例 #5
0
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """Cut each model down to the core residues reported by theseus.

    Only residues from the first protein are listed in the theseus output, and
    not even all of them.

    We assume the output is based on the original alignment, so that each
    residue in the first protein lines up with either another residue in one
    of the other proteins or a gap.

    So we go through the theseus data and, for each residue that is core, find
    the corresponding residues in the other proteins. The resSeq numbers are
    used to match the residues across the alignment.

    Args:
        models: paths of the pdb files. Their basenames must equal, in order,
            the names found in the alignment headers.
        alignment_file: fasta alignment of the models; headers are expected
            to look like '>1ujb.pdb(A)'.
        var_by_res: iterable of objects exposing ``resSeq`` and ``core``
            attributes for residues of the first model.
        work_dir: directory for the core pdb files. It is created when it
            does not exist. When None, no directory is created and None is
            handed to ``ample_util.filename_append`` unchanged.

    Returns:
        List of paths of the core pdb files, in the order of ``models``.

    Raises:
        RuntimeError: if the pdb names do not match the alignment headers, or
            if ``var_by_res`` contains no core residues.
    """
    # os.path.isdir(None) raises TypeError, so only touch the filesystem when
    # a directory was actually given
    if work_dir is not None and not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)

    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models:
        seqalign.add_pdb_data(m)

    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [h[1:].split('(')[0] for h in seqalign.headers]
    if seqalign.pdbs != names:
        raise RuntimeError(
            "headers and names of pdb files do not match!\n{0}\n{1}".format(
                seqalign.pdbs, names))

    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]

    # Dictionary mapping model pdb to resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs:
        model2core[p] = []  # initialise

    # Get list of core resSeqs in the first sequence
    first_core = [x.resSeq for x in var_by_res if x.core]
    if not first_core:
        # Without this guard the walk below would fail with a bare IndexError
        raise RuntimeError(
            "Cannot generate core for models: {0}".format(models))
    model2core[first] = first_core

    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0  # Tracks where we are in the list of core resSeqs of the first model
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if first_core[pointer] != resSeq:
            continue
        # Core residue in first sequence so append the corresponding resSeqs for the other proteins
        for j, pdb in enumerate(seqalign.pdbs[1:]):
            model2core[pdb].append(seqalign.resseqs[j + 1][i])
        pointer += 1
        if pointer >= len(first_core):
            break

    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)

    return core_models
コード例 #6
0
 def test_resseq(self):
     """Check the per-sequence lists and the last resSeq read from 1D7M.pdb.

     Two sequences, headers and pdb entries are expected, the pdb entry
     being the basename of the input file, and the last resSeq of the
     first sequence is expected to be 343.
     """
     pdbin = os.path.join(self.testfiles_dir, '1D7M.pdb')
     s1 = sequence_util.Sequence(pdb=pdbin)
     # assertTrue(value, expected) treats ``expected`` as the failure message
     # and only checks truthiness, so the values are compared with assertEqual.
     self.assertEqual(len(s1.sequences), 2)
     self.assertEqual(len(s1.headers), 2)
     self.assertEqual(len(s1.pdbs), 2)
     self.assertEqual(s1.pdbs[0], os.path.basename(pdbin))
     self.assertEqual(s1.resseqs[0][-1], 343)
コード例 #7
0
ファイル: test_sequence_util.py プロジェクト: hlasimpk/ample
 def test_fail_char(self):
     """``canonicalise`` must accept the first sequence and reject the second.

     The second sequence is the first one with a single residue replaced
     by 'X'; canonicalising it is expected to raise RuntimeError.
     """
     seq = sequence_util.Sequence()

     # Test case 1 - expected to work
     seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVYAIAFTMYLSMLLGYGLTMVP"]
     try:
         seq.canonicalise()
     except RuntimeError as err:
         self.assertTrue(False, err)

     # Test case 2 - expected to fail
     seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVXAIAFTMYLSMLLGYGLTMVP"]
     with self.assertRaises(RuntimeError):
         seq.canonicalise()
コード例 #8
0
    def test_from_pdb(self):
        """Build a Sequence from 4DZN.pdb and check its name, chains and FASTA.

        Three chains (A, B and C) are expected, each recorded against
        '4DZN.pdb' and each rendered as its own FASTA record.
        """
        pdb_path = os.path.join(self.testfiles_dir, '4DZN.pdb')
        seq = sequence_util.Sequence(pdb=pdb_path)

        self.assertEqual(seq.name, '4DZN')
        self.assertEqual(seq.pdbs, ['4DZN.pdb', '4DZN.pdb', '4DZN.pdb'])
        self.assertEqual(seq.chains, ['A', 'B', 'C'])

        expected_lines = [
            ">From pdb: 4DZN.pdb chain=A length=31",
            "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
            "",
            ">From pdb: 4DZN.pdb chain=B length=31",
            "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
            "",
            ">From pdb: 4DZN.pdb chain=C length=31",
            "GEIAALKQEIAALKKEIAALKEIAALKQGYY",
            "",
        ]
        # Every line, including the empty separators, ends with os.linesep
        expected = "".join(line + os.linesep for line in expected_lines)

        self.assertEqual(expected, "".join(seq.fasta_str()))
コード例 #9
0
    def test_OK(self):
        """Parse an irregularly wrapped FASTA record and check the output.

        The input lines have varying widths; the text returned by
        ``fasta_str`` is expected to hold the same residues in rows of 80
        characters, and ``length()`` is expected to be 249.
        """
        input_lines = [
            ">3HAP:A|PDBID|CHAIN|SEQUENCE",
            "QAQITGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAI",
            "TTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLDLALLV",
            "DADQGTILAAVGADGIMIGTGLVGALTKVYSYRFVWWAISTAA",
            "MLYILYVLFFGFTSKAESMRPEVASTFKVL",
            "RNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILL",
            "RSRAIFGEAEAPEPSAGDGAAATSD",
        ]
        # No separator after the final line, then split back into lines
        infasta = os.linesep.join(input_lines)

        parsed = sequence_util.Sequence()
        parsed._parse_fasta(infasta.split(os.linesep))

        expected_lines = [
            ">3HAP:A|PDBID|CHAIN|SEQUENCE",
            "QAQITGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYW",
            "ARYADWLFTTPLLLLDLALLVDADQGTILAAVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLFFGFTSKA",
            "ESMRPEVASTFKVLRNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILLRSRAIFGEAEAPEPSA",
            "GDGAAATSD",
            "",
        ]
        expected = "".join(line + os.linesep for line in expected_lines)

        self.assertEqual(expected, "".join(parsed.fasta_str()))
        self.assertEqual(parsed.length(), 249)
コード例 #10
0
    # Get full paths to all files
    args.input_file = os.path.abspath(args.input_file)
    if not os.path.isfile(args.input_file):
        raise RuntimeError("Cannot find input file: {}".format(
            args.input_file))

    if args.output_file:
        args.output_file = os.path.abspath(args.output_file)
    else:
        n = os.path.splitext(os.path.basename(args.input_file))[0]
        args.output_file = n + "_std.pdb"

    if args.ren:
        renumber_residues(args.input_file, args.output_file, start=1)
    elif args.std:
        standardise(args.input_file,
                    args.output_file,
                    del_hetatm=True,
                    chain=args.chain)
    elif args.seq:
        logging.debug(sequence_util.Sequence(pdb=args.input_file).fasta_str())
    elif args.split_models:
        logging.debug(split_pdb(args.input_file))
    elif args.split_chains:
        logging.debug(split_into_chains(args.input_file, chain=args.chain))
    elif args.chain:
        logging.debug(
            extract_chain(args.input_file,
                          args.output_file,
                          chainID=args.chain))
コード例 #11
0
 def test_canonicalise_4(self):
     """``canonicalise`` must raise RuntimeError for this sequence.

     The sequence contains an 'X' and ends with '*'.
     """
     seq = sequence_util.Sequence()
     seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVXAIAFTMYLSMLLGYGLTMVP*"]
     self.assertRaises(RuntimeError, seq.canonicalise)
コード例 #12
0
 def test_canonicalise_1(self):
     """``canonicalise`` must complete without raising for this sequence."""
     seq = sequence_util.Sequence()
     seq.sequences = ["YFLVKGMGVSDPDAKKFYAITTLVYAIAFTMYLSMLLGYGLTMVP"]
     # Any exception raised here fails the test
     seq.canonicalise()
     self.assertTrue(True)
コード例 #13
0
    def test_addPdb_data(self):
        """Attach pdb data to a three-way alignment and check the result.

        After ``add_pdb_data`` has been called for each model, the pdb name
        and chain of every sequence, and the resSeq list of every sequence,
        are compared with the expected values. Each expected resSeq list has
        271 entries, some of which are None.
        """

        def expand(*runs):
            # Build an expected resSeq list from a compact description:
            # an int N expands to N None entries, and a (first, last) pair
            # expands to the consecutive integers first..last inclusive.
            values = []
            for run in runs:
                if isinstance(run, tuple):
                    first, last = run
                    values.extend(range(first, last + 1))
                else:
                    values.extend([None] * run)
            return values

        input_dir = os.path.join(self.ample_share, 'examples', 'homologs',
                                 'input')
        alignment = os.path.join(self.testfiles_dir,
                                 '1ujb_2a6pA_3c7tA.afasta')
        pdb_paths = [
            os.path.join(input_dir, '1ujbA.pdb'),
            os.path.join(input_dir, '2a6pA.pdb'),
            os.path.join(input_dir, '3c7tA.pdb'),
        ]

        seq = sequence_util.Sequence(fasta=alignment)
        for path in pdb_paths:
            seq.add_pdb_data(path)

        for idx, path in enumerate(pdb_paths):
            self.assertEqual(seq.pdbs[idx], os.path.basename(path))
            self.assertEqual(seq.chains[idx], 'A')

        expected_1ujb = expand(
            2, (1, 15), 34, (16, 44), 1, (45, 69), 2, (70, 82), 43,
            (83, 83), 2, (84, 91), 1, (92, 120), 8, (121, 123), 3,
            (124, 125), 2, (126, 126), 2, (127, 138), 2, (139, 149), 2,
            (150, 151), 1, (152, 156), 10,
        )

        expected_2a6p = expand(
            (4, 22), 29, (23, 28), 1, (29, 50), 1, (51, 73), 4, (74, 74), 2,
            (75, 75), 1, (76, 104), 8, (105, 132), 1, (133, 137), 2,
            (138, 159), 8, (160, 163), 1, (164, 169), 2, (170, 184), 3,
            (185, 192), 1, (193, 196), 14,
        )

        expected_3c7t = expand(
            1, (72, 124), 1, (125, 149), 1, (150, 250), 1, (251, 288), 2,
            (289, 303), 2, (304, 304), 3, (305, 315), 1, (316, 330),
        )

        self.assertEqual(seq.resseqs[0], expected_1ujb)
        self.assertEqual(seq.resseqs[1], expected_2a6p)
        self.assertEqual(seq.resseqs[2], expected_3c7t)
コード例 #14
0
def model_core_from_fasta(models,
                          alignment_file,
                          work_dir=None,
                          case_sensitive=False):
    """Cut each model down to the residues that are core in a fasta alignment.

    An alignment column is core when no sequence has a gap ('-') in it or,
    with ``case_sensitive``, when every character in the column is a key of
    ``ample_util.one2three``.

    Args:
        models: paths of the pdb files. Their basenames must equal the names
            in the alignment headers (after removal of any trailing chain
            tag such as '(A)').
        alignment_file: fasta alignment of the models.
        work_dir: directory for the core pdb files. It is created when it
            does not exist. When None, no directory is created and None is
            handed to ``ample_util.filename_append`` unchanged.
        case_sensitive: select the ``one2three``-based definition of core
            instead of the gap-based one.

    Returns:
        List of paths of the core pdb files, in the order of ``models``.

    Raises:
        RuntimeError: if no alignment column is core.
    """
    # os.path.isdir(None) raises TypeError, so only touch the filesystem when
    # a directory was actually given
    if work_dir is not None and not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in the alignment
    align_seq = sequence_util.Sequence(fasta=alignment_file)

    # NOTE(review): the sequences are not checked to be of equal length; the
    # zip() below only covers the columns of the shortest sequence.

    # Get pdb names from alignment headers. If the alignment file is from
    # gesamt, the names have the chain names in brackets appended, so strip
    # that suffix (and only the suffix) from each name.
    chain_suffix = re.compile(r"\([a-zA-Z]*\)$")
    seq_names = [chain_suffix.sub("", h[1:].strip()) for h in align_seq.headers]

    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    columns = zip(*align_seq.sequences)
    if case_sensitive:
        # Not the default, as Theseus ignores lower-case letters in the alignment
        residue_codes = set(ample_util.one2three.keys())
        core = [all(x in residue_codes for x in col) for col in columns]
    else:
        core = [all(x != GAP for x in col) for col in columns]

    if not any(core):
        raise RuntimeError(
            "Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core. Positions
    # are counted over the residues of that sequence only, i.e. gaps do not
    # advance the counter.
    core_positions = []
    for seq in align_seq.sequences:
        positions = []
        count = 0
        for i, pos in enumerate(seq):
            if pos == GAP:
                continue
            if core[i]:
                positions.append(count)
            count += 1
        core_positions.append(positions)

    # Should check lengths of sequences match the length of the aa in the pdbs

    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)

    return core_models