Exemple #1
0
    def test2_generate_peptides_novariants(self):
        """
        Test if a list of proteins is correctly broken into peptide fragments.
        Here the proteins are constructed just from their sequence, having no
        transcript or variant information.
        """
        pep_set = generate_peptides_from_protein(self.prot_set, 3)

        # # Print peptide generator results:
        # for pep in pep_set:
        #     print pep, pep.proteins.items()
        #     print pep, pep.vars.items()
        #     print pep, pep.transcripts.items()

        # get the number of peptides generated for each protein in self.prot_set and sum up
        number_of_peps = sum(len(pep.proteins.keys()) for pep in pep_set)
        # The total number of peptides of length 3 from all proteins in self.pro_set should be 14
        self.assertEqual(number_of_peps, 14)

        # generated pep_set should consist only of unique-sequence entries
        unique_test_prot_set = list()
        unique_test_prot_set.extend(self.prot_set)
        unique_test_prot_set.extend(self.prot_set)

        unique_test_pep_set = generate_peptides_from_protein(unique_test_prot_set, 3)
        unique_test_pep_seqs = set([str(pep) for pep in unique_test_pep_set])
        self.assertEqual(len(unique_test_pep_set), len(unique_test_pep_seqs))
    def setUp(self):
        self.peptides = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]
        testsequences_file = pkg_resources.resource_filename('Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
        with open(testsequences_file, "rU") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
        prot_set = [Protein(str(r.seq)) for r in records]
        unique_test_pep_set = generate_peptides_from_protein(prot_set, 9)
        self.selfpeptides = [str(x) for x in unique_test_pep_set]

        small_prot_set = [Protein("MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE"), Protein("MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED")]
        small_unique_test_pep_set = generate_peptides_from_protein(small_prot_set, 9)
        self.fewselfpeptides = [str(x) for x in small_unique_test_pep_set]
Exemple #3
0
    def test2_generate_peptides_novariants(self):
        """
        Test if a list of proteins is correctly broken into peptide fragments.
        Here the proteins are constructed from their sequence, having no
        transcript or variant information.
        (prot_set generation in 'setUp' method)
        """
        def get_total_peps(pep_set):
            _sum = 0
            for pep in pep_set:
                _sum += len(pep.proteins.keys())
            return _sum

        pep_set = generate_peptides_from_protein(self.prot_set, 3)

        # # Print peptide generator results:
        # for pep in pep_set:
        #     print pep, pep.proteins.items()
        #     print pep, pep.vars.items()
        #     print pep, pep.transcripts.items()

        # The total number of fragments should be 14
        # which is the sum over the individual originating proteins
        self.assertEqual(get_total_peps(pep_set), 14)

        # pep_set consists only of unique entries
        pep_unique_seq = set([str(pep) for pep in pep_set])
        self.assertEqual(len(pep_set), len(pep_unique_seq))
Exemple #4
0
    def test3_protein_from_variants(self):
        """
        Generate some transcripts from the 3 input variants
        (should give 8 transcripts, check also if all fields are complete)
        Using a protein made from variants:

        Translate to proteins (check if all fields are there/filled)

        fragment to unique peptides
        (check for uniqueness of sequences, check fields of peptides, check
        correctness of fragments)
        """
        dummy_db = DummyAdapter()
        dummy_vars = [var_10, var_11, var_12]

        proteins = []
        t = list(generate_transcripts_from_variants(dummy_vars, dummy_db))
        for trans in t:
            # check gene id field:
            print trans
            self.assertEqual(trans.gene_id, "gene_1")

            # check trans id name:
            name = trans.transcript_id.split(":FRED2_")
            self.assertEqual(len(name), 2)
            self.assertTrue(name[0] == "tsc_1" or name[0] == "tsc_2")
            self.assertTrue(len(name[1]) == 1 and name[1].isdigit)

            # check var:
            self.assertIsNotNone(trans.vars)
            self.assertTrue(len(trans.vars) > 0)

            # check sequence:
            self.assertTrue(str(trans) > 5)

            ### GET PROTS:
            # IGNORE invalid sequence lengths
            try:
                proteins.append(generate_proteins_from_transcripts(trans).next())
            except ValueError:
                pass

        self.assertEqual(len(proteins), 8)

        ## CHECK Proteins:
        for prot in proteins:
            self.assertEqual(prot.gene_id, "gene_1")

            # check trans id name:
            name = prot.transcript_id.split(":FRED2_")
            self.assertEqual(len(name), 2)
            self.assertTrue(name[0] == "tsc_1" or name[0] == "tsc_2")
            self.assertTrue(len(name[1]) == 1 and name[1].isdigit)

            orig = prot.orig_transcript
            self.assertEqual(prot.transcript_id, orig.transcript_id)
            self.assertEqual(len(set(e for subl in prot.vars.itervalues() for e in subl)), len(orig.vars))

            # check sequence:
            self.assertTrue(str(prot) > 2)

        ## GENERATE Peptides:
        peptides = generate_peptides_from_protein(proteins,2)
Exemple #5
0
    def test4_peptides_from_variants(self):
        """
        Ref trancript: AAAAACCCCCGGGGG
        ref protein:   KNPRG
        ref peps(3):   KNPR, NPRG

        variant1: heterozygous, fs+1 in first aa
        variant2: heterozygous, insertion +2 in last aa

        trans-var1: TKPPGA
        1: peps(3): TKPP, KPPG, PPGA

        trans-var2: KNPRG
        2: peps(3): KNPR, NPRG

        Output:
        -------
        PEPTIDE: PPGA
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(15CC)
                 Variant(1C)
        PEPTIDE: KPPG
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)
        PEPTIDE: TKPP
            TRANSCRIPT: tsc_1:FRED2_3
                 Variant(1C)

        PEPTIDE: KNPR
            TRANSCRIPT: tsc_1:FRED2_0
        PEPTIDE: NPRG
            TRANSCRIPT: tsc_1:FRED2_0
        """
        #TODO Somewhere here a print statement is called
        peps_trans1 = ["KNPR", "NPRG"]
        peps_trans2 = ["PPGA", "KPPG", "TKPP"]
        expected_vars = ["Variant(1C)", "Variant(15CC)"]
        expected = peps_trans1 + peps_trans2

        dummy_db = DummyAdapter()
        dummy_vars = [var_13, var_14]

        proteins = []
        transcripts = list(generate_transcripts_from_variants(dummy_vars, dummy_db))
        for trans in transcripts:
            ### GET PROTS:
            # IGNORE invalid sequence lengths
            try:
                proteins.append(generate_proteins_from_transcripts(trans).next())
            except ValueError:
                pass

        peptides = generate_peptides_from_protein(proteins, 4)

        sequences = [str(pep) for pep in peptides]

        # Check if all peptides are generated as expected
        self.assertTrue(all(pep in sequences for pep in expected))
        # no duplicates or more than the expected ones:
        self.assertEqual(len(peptides), len(expected))

        #vari_peps = [pep.get_all_variants() for pep in peptides \
        #             if str(pep) in peps_trans2]

        #vars_ = [str(var) for varlist in vari_peps for var in varlist]

        # Check that for the peptides from the transcript containing the
        # variants, we also get all expected variants. Especally the first
        # variant needs to be present in all peptides
        for prot in proteins:
            for p in peptides:
                try:
                    vars_ = map(str, p.get_variants_by_protein(prot.transcript_id))
                    expected_vars = [str(v) for vars in prot.vars.itervalues() for v in vars]
                    print "peptide vars: ", vars_
                    print "Prot vars: ", expected_vars
                    print repr(p)
                    print repr(prot)
                    self.assertTrue(all(var in expected_vars for var in vars_))
                except ValueError:

                    pass
Exemple #6
0
def main():
    parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+
                                                 "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             +" or peptide sequences as sequences (max 50)"
                        )
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r","--refseq",
                             action="store_true",
                             help= "Specifies the input as RefSeq IDs")
    input_types.add_argument("-u","--uniprot",
                             action="store_true",
                             help= "Specifies the input as UniProt IDs")
    input_types.add_argument("-f","--fasta",
                             action="store_true",
                             help= "Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf","--pepfile",
                             action="store_true",
                             help= "Specifies the input as peptide file")
    input_types.add_argument("-p","--peptide",
                             action="store_true",
                             help= "Specifies the input as peptide sequences")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. " +
                             "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                               action="store_true",
                               help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                               action="store_true",
                               help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                       required=True,
                       nargs="+",
                       help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    #COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        for pred, obj in AEpitopePrediction.registry.iteritems():
            if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]:
                print "Method: ",pred
                print "Supported Alleles: ", " ".join(getattr(obj, "_"+pred+"__alleles" ))
                print "Supported Length: ", " ".join(map(str, getattr(obj,  "_"+pred+"__supported_length")))
                print
        sys.exit(0)


    '''
    Parser Input
    '''
    #RefSeq
    if args.refseq:
        pass

    #UniProt
    elif args.uniprot:
        pass

    #fasta protein
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, type="Protein")
        peptides = generate_peptides_from_protein(proteins, args.length)

    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, type="Peptide")

    elif args.peptide:
        peptides = [Peptide(s) for s in args.input]

    #read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, type="Allele")
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output)
    with open(output, "w") as out:
        r_df.to_csv(out)



    #generate Galaxy HTML output
    if args.html:
        begin_html = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" />
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script>
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script>
    <script type="text/javascript" src="/static/scripts/libs/etk.js"></script>
</head>

<body>
    <div class="document">"""

        setting = """  <h2 class="etk-heading">Epitope Prediction Results</h2>

        <table class="etk-parameterT">
            <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr>
            <tr>
                <th>Prediction Method:</th>
                <td>%s</td>
            </tr>
        </table>"""%args.method



        table="""

        <input id="etk-search" placeholder="  filter">
        <table class="etk-sortT etk-resultsT etk-filterT">

            <thead>
                <tr>
                    <th>Peptide</th>"""+"".join("<th>%s</th>"%str(a) for a in result.columns) \
            +"""
                </tr>
            </thead>"""+"".join("<tr><td>%s<td>%s</tr>"%(r[0] ,"".join("<td align='right'>%s</td>"%str(result.loc[r, c])))
                                for r in result.index for c in result.columns)+"</table>"

        end_html = "</div></body></html>"

        html_out = ".".join(output.split(".")[:-1])+".html"
        with open(html_out, "w") as html_o:
            html_o.write(begin_html+setting+table+end_html)