def test2_generate_peptides_novariants(self):
    """
    Check that sequence-only proteins are fragmented into peptides correctly.

    The proteins in ``self.prot_set`` are built from their sequence alone,
    i.e. they carry no transcript or variant information. Two properties of
    the length-3 fragmentation are verified: the summed size of every
    peptide's ``proteins`` mapping, and the absence of duplicate sequences
    in the result.
    """
    fragments = generate_peptides_from_protein(self.prot_set, 3)

    # Every peptide keeps track of the proteins it occurs in; adding up
    # those per-peptide counts is expected to give 14 for self.prot_set.
    origin_count = 0
    for fragment in fragments:
        origin_count += len(fragment.proteins.keys())
    self.assertEqual(origin_count, 14)

    # Feed every protein twice: the generated peptides must still contain
    # each distinct sequence only once.
    doubled_input = list(self.prot_set) + list(self.prot_set)
    doubled_fragments = generate_peptides_from_protein(doubled_input, 3)
    distinct_sequences = set(str(fragment) for fragment in doubled_fragments)
    self.assertEqual(len(doubled_fragments), len(distinct_sequences))
def setUp(self):
    """
    Build the peptide fixtures used by the tests.

    Three attributes are set:

    * ``self.peptides`` -- two hand-written 9-mer ``Peptide`` objects.
    * ``self.selfpeptides`` -- string form of every 9-mer generated from
      the proteins in the packaged ``testSequences.fasta`` example file.
    * ``self.fewselfpeptides`` -- string form of every 9-mer generated
      from two hard-coded protein sequences.
    """
    self.peptides = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]

    # Locate the example FASTA file shipped inside the Fred2 package.
    fasta_path = pkg_resources.resource_filename(
        'Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
    # NOTE(review): the "U" flag is a Python 2 idiom; Python 3.11+ rejects it.
    with open(fasta_path, "rU") as fasta:
        fasta_records = list(SeqIO.parse(fasta, "fasta"))

    fasta_proteins = []
    for record in fasta_records:
        fasta_proteins.append(Protein(str(record.seq)))
    fasta_peptides = generate_peptides_from_protein(fasta_proteins, 9)
    self.selfpeptides = [str(peptide) for peptide in fasta_peptides]

    # A much smaller "self" set built from two fixed protein sequences.
    first_sequence = "MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE"
    second_sequence = "MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED"
    few_proteins = [Protein(first_sequence), Protein(second_sequence)]
    few_peptides = generate_peptides_from_protein(few_proteins, 9)
    self.fewselfpeptides = [str(peptide) for peptide in few_peptides]
def test2_generate_peptides_novariants(self):
    """
    Check that a list of proteins is correctly split into peptide fragments.

    The proteins come from ``self.prot_set`` (prepared in ``setUp``) and are
    built from their sequence only, without transcript or variant
    information.
    """
    fragments = generate_peptides_from_protein(self.prot_set, 3)

    # Summed over all peptides, the number of originating proteins per
    # peptide is expected to be 14.
    total_origins = sum(len(fragment.proteins.keys()) for fragment in fragments)
    self.assertEqual(total_origins, 14)

    # No sequence may appear more than once among the generated peptides.
    distinct_sequences = set()
    for fragment in fragments:
        distinct_sequences.add(str(fragment))
    self.assertEqual(len(fragments), len(distinct_sequences))
def test3_protein_from_variants(self):
    """
    Generate transcripts from three input variants and translate them.

    For every generated transcript the gene id, the transcript id format,
    the attached variants and the sequence length are checked. Each
    transcript is then translated; transcripts whose translation raises
    ``ValueError`` are skipped. Exactly 8 proteins are expected, and for
    each of them the gene id, the transcript id, the link back to the
    originating transcript, the number of distinct variants and the
    sequence length are checked. Finally the proteins are handed to the
    peptide generator.
    """
    def check_transcript_id(transcript_id):
        # Expected shape: "<tsc_1|tsc_2>:FRED2_<single digit>".
        parts = transcript_id.split(":FRED2_")
        self.assertEqual(len(parts), 2)
        self.assertIn(parts[0], ("tsc_1", "tsc_2"))
        # isdigit must be *called*; the bare bound method is always truthy.
        self.assertTrue(len(parts[1]) == 1 and parts[1].isdigit())

    dummy_db = DummyAdapter()
    dummy_vars = [var_10, var_11, var_12]
    proteins = []

    transcripts = list(generate_transcripts_from_variants(dummy_vars, dummy_db))
    for trans in transcripts:
        self.assertEqual(trans.gene_id, "gene_1")
        check_transcript_id(trans.transcript_id)

        # Every generated transcript must carry at least one variant.
        self.assertIsNotNone(trans.vars)
        self.assertTrue(len(trans.vars) > 0)

        # Compare the sequence *length*, not the string itself, to the bound.
        self.assertTrue(len(str(trans)) > 5)

        # Translate; invalid sequence lengths are deliberately ignored.
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    self.assertEqual(len(proteins), 8)

    for prot in proteins:
        self.assertEqual(prot.gene_id, "gene_1")
        check_transcript_id(prot.transcript_id)

        # The protein must point back to the transcript it was made from.
        orig = prot.orig_transcript
        self.assertEqual(prot.transcript_id, orig.transcript_id)

        # prot.vars maps to lists of variants; flattened and de-duplicated
        # it must contain as many variants as the transcript.
        protein_variants = set(
            variant
            for variant_list in prot.vars.values()
            for variant in variant_list
        )
        self.assertEqual(len(protein_variants), len(orig.vars))

        self.assertTrue(len(str(prot)) > 2)

    # Smoke check only: the call must not raise. The result is not
    # inspected here.
    generate_peptides_from_protein(proteins, 2)
def test4_peptides_from_variants(self):
    """
    Check the peptides (and their variants) generated from two variants.

    Ref transcript: AAAAACCCCCGGGGG
    ref protein:    KNPRG
    ref peps(4):    KNPR, NPRG

    variant1: heterozygous, fs+1 in first aa
    variant2: heterozygous, insertion +2 in last aa

    trans-var1: TKPPGA
    1: peps(4): TKPP, KPPG, PPGA

    trans-var2: KNPRG
    2: peps(4): KNPR, NPRG

    Output:
    -------
    PEPTIDE: PPGA
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(15CC)
            Variant(1C)
    PEPTIDE: KPPG
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(1C)
    PEPTIDE: TKPP
        TRANSCRIPT: tsc_1:FRED2_3
            Variant(1C)
    PEPTIDE: KNPR
        TRANSCRIPT: tsc_1:FRED2_0
    PEPTIDE: NPRG
        TRANSCRIPT: tsc_1:FRED2_0
    """
    peps_trans1 = ["KNPR", "NPRG"]
    peps_trans2 = ["PPGA", "KPPG", "TKPP"]
    expected = peps_trans1 + peps_trans2

    dummy_db = DummyAdapter()
    dummy_vars = [var_13, var_14]

    proteins = []
    transcripts = list(generate_transcripts_from_variants(dummy_vars, dummy_db))
    for trans in transcripts:
        # Translate; invalid sequence lengths are deliberately ignored.
        try:
            proteins.append(next(generate_proteins_from_transcripts(trans)))
        except ValueError:
            pass

    peptides = generate_peptides_from_protein(proteins, 4)
    sequences = [str(pep) for pep in peptides]

    # All expected peptides were generated ...
    self.assertTrue(all(pep in sequences for pep in expected))
    # ... and there are neither duplicates nor unexpected extras.
    self.assertEqual(len(peptides), len(expected))

    # For every protein, the variants a peptide reports for that protein
    # must all be variants of the protein itself. Especially the first
    # variant needs to be present in all peptides of the variant transcript.
    for prot in proteins:
        expected_vars = [
            str(variant)
            for variant_list in prot.vars.values()
            for variant in variant_list
        ]
        for pep in peptides:
            # ValueError is tolerated here -- presumably raised when the
            # peptide does not originate from this protein (confirm against
            # get_variants_by_protein); such pairs are skipped.
            try:
                peptide_vars = [
                    str(variant)
                    for variant in pep.get_variants_by_protein(prot.transcript_id)
                ]
            except ValueError:
                continue
            self.assertTrue(all(var in expected_vars for var in peptide_vars))
def _build_argument_parser():
    """Create the command line parser for the epitope prediction tool."""
    parser = argparse.ArgumentParser(
        description="Reads protein or peptide sequences and predicts peptides "
                    + "for a specified prediction method and HLA alleles.")

    parser.add_argument("-i", "--input", nargs="+", required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             + " or peptide sequences as sequences (max 50)")

    # Exactly one flag must say how --input is to be interpreted.
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r", "--refseq", action="store_true",
                             help="Specifies the input as RefSeq IDs")
    input_types.add_argument("-u", "--uniprot", action="store_true",
                             help="Specifies the input as UniProt IDs")
    input_types.add_argument("-f", "--fasta", action="store_true",
                             help="Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf", "--pepfile", action="store_true",
                             help="Specifies the input as peptide file")
    input_types.add_argument("-p", "--peptide", action="store_true",
                             help="Specifies the input as peptide sequences")

    parser.add_argument("-a", "--alleles", nargs="+", required=True,
                        help="Specifies for which alleles prediction should be made. "
                             + "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")

    # Exactly one flag must say how --alleles is to be interpreted.
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile", action="store_true",
                              help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring", action="store_true",
                              help="Specifies the allele input as allele string.")

    parser.add_argument("-m", "--method", required=True, nargs="+",
                        help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length", required=False, type=int, default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output", required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available", required=False, action="store_true",
                        help="Returns all available methods and their allele models.")

    # COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html", required=False, action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir", required=False, default="",
                        help=argparse.SUPPRESS)
    return parser


def _print_available_methods():
    """Print every registered prediction method with its alleles and lengths."""
    # Registry entries that are not listed as usable methods.
    skipped = ("AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC",
               "ASVMEpitopePrediction")
    for pred, obj in AEpitopePrediction.registry.items():
        if pred in skipped:
            continue
        # The attributes are read through their name-mangled private names.
        alleles = getattr(obj, "_" + pred + "__alleles")
        lengths = getattr(obj, "_" + pred + "__supported_length")
        print("Method:  %s" % pred)
        print("Supported Alleles:  %s" % " ".join(alleles))
        print("Supported Length:  %s" % " ".join(map(str, lengths)))
        print("")


def _render_html_report(r_df, methods):
    """
    Render the prediction table as the HTML page used by Galaxy/ETK.

    :param r_df: merged prediction results; one table row is written per
                 index entry and one cell per column
    :param methods: names of the prediction methods that were run
    :return: the complete HTML document as a string
    """
    begin_html = (
        '<?xml version="1.0" encoding="utf-8" ?> '
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> '
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> '
        '<head> '
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> '
        '<link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" /> '
        '<script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script> '
        '<script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script> '
        '<script type="text/javascript" src="/static/scripts/libs/etk.js"></script> '
        '</head> <body> <div class="document">'
    )

    setting = (
        ' <h2 class="etk-heading">Epitope Prediction Results</h2> '
        '<table class="etk-parameterT"> '
        '<tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr> '
        '<tr> <th>Prediction Method:</th> <td>%s</td> </tr> '
        '</table>'
    ) % ", ".join(str(m) for m in methods)

    header_cells = "".join("<th>%s</th>" % str(a) for a in r_df.columns)

    # One <tr> per index entry, holding one right-aligned cell per column.
    # r[0] is used as the row label -- assumes index entries are tuples
    # whose first element is the peptide; TODO confirm against predict().
    body_rows = "".join(
        "<tr><td>%s</td>%s</tr>" % (
            r[0],
            "".join("<td align='right'>%s</td>" % str(r_df.loc[r, c])
                    for c in r_df.columns),
        )
        for r in r_df.index
    )

    table = (
        ' <input id="etk-search" placeholder=" filter"> '
        '<table class="etk-sortT etk-resultsT etk-filterT"> '
        '<thead> <tr> <th>Peptide</th>'
        + header_cells
        + ' </tr> </thead>'
        + body_rows
        + "</table>"
    )

    end_html = "</div></body></html>"
    return begin_html + setting + table + end_html


def main():
    """
    Command line entry point: predict epitopes and write them to CSV.

    Parses the command line, reads peptides (directly, from a peptide file,
    or by fragmenting the proteins of a FASTA file) and alleles, runs every
    requested prediction method, merges the per-method results and writes
    them as CSV to ``--output``. With ``--html`` an additional HTML report is
    written next to the CSV file. With ``--available`` the registered methods
    are listed and the program exits with status 0.

    RefSeq and UniProt input are not implemented; selecting them ends the
    program with an argparse usage error (exit status 2).
    """
    parser = _build_argument_parser()
    args = parser.parse_args()

    # NOTE(review): parse_args() enforces the required options first, so
    # --available is only reached when those are supplied as well.
    if args.available:
        _print_available_methods()
        sys.exit(0)

    # Parser Input
    if args.refseq:
        parser.error("RefSeq input is not supported yet")
    elif args.uniprot:
        parser.error("UniProt input is not supported yet")
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, type="Protein")
        peptides = generate_peptides_from_protein(proteins, args.length)
    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, type="Peptide")
    else:
        # The input group is required, so this is --peptide.
        peptides = [Peptide(s) for s in args.input]

    # read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, type="Allele")
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    # Run every requested method, then fold the per-method frames into one:
    # align pairwise (missing cells become 0) and add.
    result = [EpitopePredictorFactory(m).predict(peptides, alleles)
              for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    # With --outdir only the file name of --output is kept; os.path.join
    # works whether or not outdir ends with a path separator.
    if args.outdir == "":
        output = args.output
    else:
        output = os.path.join(args.outdir, os.path.basename(args.output))

    with open(output, "w") as out:
        r_df.to_csv(out)

    # generate Galaxy HTML output
    if args.html:
        # Replace the extension (if any) of the CSV path with ".html".
        html_out = os.path.splitext(output)[0] + ".html"
        with open(html_out, "w") as html_o:
            html_o.write(_render_html_report(r_df, args.method))