def main(input_peptides, input_vaccine_sequences, output_vaccine_epitopes, verbose, log_file):
    ''' Reads the vaccine produced by Fischer's online tool and converts it into epitopes
    '''
    global LOGGER
    LOGGER = utilities.init_logging(verbose, log_file, log_append=False)

    LOGGER.info('Reading peptides...')
    with open(input_peptides) as f:
        peptides = set(r['peptide'] for r in csv.DictReader(f))
    LOGGER.info('Read %d peptides', len(peptides))

    LOGGER.info('Reading vaccine...')
    mosaics = FileReader.read_fasta(input_vaccine_sequences, in_type=Protein)
    LOGGER.info('Vaccine has %d mosaic(s)', len(mosaics))

    with open(output_vaccine_epitopes, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))

        for c, mos in enumerate(mosaics):
            pep_count = unk_count = 0
            for i in range(0, len(mos) - 8):
                # compare as a string, since the peptide set was read from csv
                pep = str(mos[i:i + 9])
                assert len(pep) == 9

                if pep in peptides:
                    writer.writerow((c, pep_count, pep))
                    pep_count += 1
                else:
                    unk_count += 1

            LOGGER.info('Mosaic %d - recognized: %d, unknown: %d', c + 1, pep_count, unk_count)
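# A hypothetical example of the file written above (values are made up for
# illustration): one row per 9-mer that was recognized among the known peptides,
# numbered separately within each mosaic of the cocktail.
#
#   cocktail,index,epitope
#   0,0,MGARASVLS
#   0,1,SLYNTVATL
#   1,0,FLGKIWPSY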
def read_annotated_proteins(proteins_file):
    ''' Reads proteins from a fasta file and extracts their metadata from the header.
        Currently follows the format of the HIV database
    '''
    proteins = FileReader.read_fasta(proteins_file, in_type=Protein)
    for prot in proteins:
        parts = prot.transcript_id.split('.')
        prot.transcript_id = parts[-1]
    return proteins
def main():
    parser = argparse.ArgumentParser(
        description="Reads protein or peptide sequences and predicts peptides "
                    "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             " or peptide sequences given directly as arguments (max 50)")
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r", "--refseq",
                             action="store_true",
                             help="Specifies the input as RefSeq IDs")
    input_types.add_argument("-u", "--uniprot",
                             action="store_true",
                             help="Specifies the input as UniProt IDs")
    input_types.add_argument("-f", "--fasta",
                             action="store_true",
                             help="Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf", "--pepfile",
                             action="store_true",
                             help="Specifies the input as peptide file")
    input_types.add_argument("-p", "--peptide",
                             action="store_true",
                             help="Specifies the input as peptide sequences")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. "
                             "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                              action="store_true",
                              help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                              action="store_true",
                              help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                        required=True,
                        nargs="+",
                        help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    # COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        for pred, obj in AEpitopePrediction.registry.items():
            if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]:
                print("Method: ", pred)
                print("Supported Alleles: ", " ".join(getattr(obj, "_" + pred + "__alleles")))
                print("Supported Length: ", " ".join(map(str, getattr(obj, "_" + pred + "__supported_length"))))
                print()
        sys.exit(0)

    ''' Parser Input '''
    # RefSeq
    if args.refseq:
        pass

    # UniProt
    elif args.uniprot:
        pass

    # fasta protein
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, in_type=Protein)
        peptides = generate_peptides_from_protein(proteins, args.length)

    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, in_type=Peptide)

    elif args.peptide:
        peptides = [Peptide(s) for s in args.input]

    # read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, in_type=Allele)
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output)
    with open(output, "w") as out:
        r_df.to_csv(out)

    # generate Galaxy HTML output
    if args.html:
        begin_html = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" />
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script>
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script>
    <script type="text/javascript" src="/static/scripts/libs/etk.js"></script>
</head>
<body>
    <div class="document">"""

        setting = """
        <h2 class="etk-heading">Epitope Prediction Results</h2>

        <table class="etk-parameterT">
            <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr>
            <tr>
                <th>Prediction Method:</th>
                <td>%s</td>
            </tr>
        </table>""" % args.method

        table = """
        <input id="etk-search" placeholder=" filter">
        <table class="etk-sortT etk-resultsT etk-filterT">
            <thead>
                <tr>
                    <th>Peptide</th>""" + "".join("<th>%s</th>" % str(a) for a in r_df.columns) + """
                </tr>
            </thead>""" + "".join(
            "<tr><td>%s</td>%s</tr>" % (
                r[0],
                "".join("<td align='right'>%s</td>" % str(r_df.loc[r, c]) for c in r_df.columns))
            for r in r_df.index) + "</table>"

        end_html = "</div></body></html>"

        html_out = ".".join(output.split(".")[:-1]) + ".html"
        with open(html_out, "w") as html_o:
            html_o.write(begin_html + setting + table + end_html)
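# A possible invocation of the CLI above, assuming the script is saved as
# epitope_prediction.py; the file names and the allele/method choices are only
# placeholders (use -am/--available to list the methods actually installed):
#
#   python epitope_prediction.py \
#       --fasta --input proteins.fasta \
#       --allelestring --alleles "HLA-A*02:01" "HLA-B*07:02" \
#       --method Syfpeithi \
#       --length 9 \
#       --output predictions.csv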
def test_read_fasta(self):
    seqs = FileReader.read_fasta(self.fa_path)
    self.assertEqual(len(seqs), 2)

    seqs = FileReader.read_fasta(self.fa_unconventional_path)  # no "|"
    self.assertEqual(len(seqs), 174)
def extract_peptides(input_sequences, max_edits, output_peptides, top_n):
    ''' Extracts peptides from the given sequences and computes protein coverage for each peptide.
        Coverage can be computed allowing for inexact matching.

        In other words, it first generates all peptides that appear in the input proteins,
        and stores which proteins each peptide appears in. Then, for every peptide, it finds
        all peptides that can be obtained by changing at most max-edits amino acids, and
        counts the proteins that contain the edited peptides.
    '''
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    LOGGER.info('Extracting protein coverage for each peptide...')
    all_peptides = utilities.Trie()
    proteins_by_peptide = {}

    for i, prot in enumerate(proteins):
        # remove non-aminoacids from the alignment
        aminoacids = ''.join(c for c in prot._data if c.isalpha())

        # make sure we only count peptides once per protein
        peptides_in_this_protein = set()
        for j in range(len(aminoacids) - 8):
            seq = str(aminoacids[j:j + 9])
            if seq not in peptides_in_this_protein:
                peptides_in_this_protein.add(seq)
                all_peptides.insert(seq)
                if seq not in proteins_by_peptide:
                    proteins_by_peptide[seq] = set()
                proteins_by_peptide[seq].add(i)

        if utilities.is_percent_barrier(i, len(proteins), 5):
            LOGGER.debug('%d proteins analyzed (%.2f%%) and %d peptides extracted...',
                         i + 1, 100 * (i + 1) / len(proteins), len(proteins_by_peptide))

    LOGGER.info('Computing reachability...')
    top_peptides = []
    with open(output_peptides, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('peptide', 'proteins'))

        for i, peptide in enumerate(proteins_by_peptide):
            # find reachable peptides and which proteins they belong to
            reachable_proteins = set()
            for reachable, edits in all_peptides.reachable_strings(peptide, max_edits):
                reachable_proteins.update(proteins_by_peptide[reachable])

            # now either update the top N or save reachability to file
            if top_n > 0:
                heapq.heappush(top_peptides, (len(reachable_proteins), peptide, reachable_proteins))
                if len(top_peptides) > top_n:
                    heapq.heappop(top_peptides)
            else:
                writer.writerow((peptide, ';'.join(map(str, reachable_proteins))))

            if utilities.is_percent_barrier(i, len(proteins_by_peptide), 2.5):
                LOGGER.debug('%d peptides analyzed (%.2f%%)...',
                             i + 1, 100 * (i + 1) / len(proteins_by_peptide))

        # save the top N to file
        if top_n > 0:
            LOGGER.info('Saving top peptides to file')
            for _, peptide, proteins in top_peptides:
                writer.writerow((peptide, ';'.join(map(str, proteins))))
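# A minimal sketch of the inexact matching used above, relying only on the
# utilities.Trie methods already called in extract_peptides() (the peptides are
# made up): with at most one edit, two 9-mers differing by a single amino acid
# reach each other, so each one is credited with the proteins of both.
#
#   trie = utilities.Trie()
#   trie.insert('MGARASVLS')   # seen, say, in protein 0
#   trie.insert('MGARASVLR')   # seen, say, in protein 1
#   for reachable, edits in trie.reachable_strings('MGARASVLS', 1):
#       print(reachable, edits)   # expected: ('MGARASVLS', 0) and ('MGARASVLR', 1)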
def vaccine(input_sequences, input_peptides, input_alleles, input_epitopes, input_vaccine, output_summary, verbose):
    # load vaccine
    with open(input_vaccine) as f:
        vaccine = {}
        for row in csv.DictReader(f):
            if row['cocktail'] not in vaccine:
                vaccine[row['cocktail']] = {}
            vaccine[row['cocktail']][int(row['index'])] = row['epitope']

    cocktail = []
    for mosaic in vaccine.values():
        ordered = sorted(mosaic.items(), key=lambda x: x[0])
        cocktail.append([e for _, e in ordered])
    LOGGER.info('Vaccine loaded')

    # load alleles
    allele_data = utilities.get_alleles_and_thresholds(input_alleles).to_dict('index')
    LOGGER.info('Loaded %d alleles', len(allele_data))

    # load peptide coverage
    peptides = {}
    with open(input_peptides) as f:
        for row in csv.DictReader(f):
            peptides[row['peptide']] = row['proteins'].split(';')
    LOGGER.info('Loaded %d peptides with coverage', len(peptides))

    # load epitopes (also fill from peptides, since some design methods do not use epitopes)
    epitope_data = {
        pep: {'immunogen': 0.0, 'alleles': [], 'proteins': prots}
        for pep, prots in peptides.items()
    }
    with open(input_epitopes) as f:
        for row in csv.DictReader(f):
            row['immunogen'] = float(row['immunogen'])
            row['alleles'] = row['alleles'].split(';') if row['alleles'] else []
            row['proteins'] = row['proteins'].split(';')
            if row['immunogen'] > 0:
                epitope_data[row['epitope']] = row
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    # load sequences
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('Loaded %d proteins', len(proteins))

    # print stats for each mosaic
    for i, mosaic in enumerate(cocktail):
        LOGGER.info('---')
        LOGGER.info('Mosaic #%d - %d epitopes', i + 1, len(mosaic))
        for epi in mosaic:
            LOGGER.info('   %s', epi)
        evaluate_epitopes(mosaic, epitope_data, allele_data, len(proteins))

    # write csv
    LOGGER.info('---')
    vaccine_stats = evaluate_epitopes(
        [epi for mosaic in cocktail for epi in mosaic],
        epitope_data, allele_data, len(proteins))

    with open(output_summary, 'w') as f:
        writer = csv.DictWriter(f, vaccine_stats.keys())
        writer.writeheader()
        writer.writerow(vaccine_stats)
def get_mosaic_solver_instance(logger, input_proteins, input_alleles, input_epitopes, input_overlaps, **kwargs):
    top_immunogen = kwargs.pop('top_immunogen')
    top_alleles = kwargs.pop('top_alleles')
    top_proteins = kwargs.pop('top_proteins')

    min_overlap = kwargs.get('min_overlap', 0)
    cocktail = kwargs.get('cocktail', 1)
    greedy_subtour = kwargs.get('greedy_subtour')
    max_epitopes = kwargs.get('max_epitopes')
    max_aminoacids = kwargs.get('max_aminoacids')
    min_alleles = kwargs.get('min_alleles', 0)
    min_proteins = kwargs.get('min_proteins', 0)
    min_avg_prot_conservation = kwargs.get('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = kwargs.get('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [Allele(a) for a in get_alleles_and_thresholds(input_alleles).index]
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitope_data = list(load_epitopes(input_epitopes, top_immunogen, top_alleles, top_proteins).values())
    logger.info('Loaded %d epitopes', len(epitope_data))

    # load edge cost
    logger.info('Loading overlaps...')
    vertex_rewards = [0] + [b['immunogen'] for b in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap, [b['epitope'] for b in epitope_data])
    logger.info('Kept %d edges (from %d)', len(edges), len(epitope_data) * (len(epitope_data) + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    type_coverage, min_type_coverage, min_avg_type_conservation = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))

    # find optimal design
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertex_rewards,
        edge_cost=edges,
        max_edge_cost=0,
        max_vertices=0,
        lazy_subtour_elimination=not greedy_subtour,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
    )

    if isinstance(max_epitopes, (int, float)):
        solver.update_max_vertices(max_epitopes)
    if isinstance(max_aminoacids, (int, float)):
        solver.update_max_edge_cost(max_aminoacids)

    return solver, {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
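# A minimal usage sketch for the solver returned above, mirroring how
# string_of_beads() drives TeamOrienteeringIlp; the file names and parameter
# values are placeholders, not the project's actual defaults:
#
#   solver, data = get_mosaic_solver_instance(
#       logger, 'proteins.fasta', 'alleles.csv', 'epitopes.csv', 'overlaps.csv',
#       top_immunogen=0, top_alleles=0, top_proteins=0,
#       cocktail=1, greedy_subtour=False, max_epitopes=10, max_aminoacids=90)
#   solver.build_model()
#   mosaics = solver.solve()   # one sequence of (vertex, vertex) edges per mosaic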
def string_of_beads(input_proteins, input_alleles, input_epitopes, input_cleavages, output_vaccine,
                    cocktail, greedy_subtour, max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [Allele(a) for a in utilities.get_alleles_and_thresholds(input_alleles).index]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores
    cleavage_epitopes = set()
    with open(input_cleavages) as f:
        cleavages = {}
        for row in csv.DictReader(f):
            cleavages[(row['from'], row['to'])] = float(row['score'])
            cleavage_epitopes.add(row['from'])
            cleavage_epitopes.add(row['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost
    edge_cost, vertices, vertices_rewards = [], [], []
    vertex_to_epitope = [''] + list(cleavage_epitopes)
    for ep_from in vertex_to_epitope:
        vertices.append(ep_from)
        vertices_rewards.append(0 if ep_from == '' else epitopes[ep_from]['immunogen'])
        edge_cost.append([
            cleavages[(ep_from, ep_to)] if ep_from != '' and ep_to != '' else 0.0
            for ep_to in vertex_to_epitope
        ])
    LOGGER.info('Kept %d epitopes with available cleavages', len(vertices) - 1)

    type_coverage, min_type_coverage, min_avg_type_conservation = utilities.compute_coverage_matrix(
        [epitopes[e] for e in vertex_to_epitope[1:]], min_alleles, min_proteins,
        min_avg_prot_conservation, min_avg_alle_conservation, len(proteins), len(alleles))

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertices_rewards,
        edge_cost=edge_cost,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids,
        max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()

    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for i, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', i + 1)
            for j, (_, vertex) in enumerate(mosaic[:-1]):
                epitope = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((i, j, epitope['epitope']))
                LOGGER.info('    %s - IG: %.2f', epitope['epitope'], epitope['immunogen'])

    LOGGER.info('==== Stopwatch')
    LOGGER.info('          Total time : %.2f s', solver_end_time - program_start_time)
    LOGGER.info('      Pre-processing : %.2f s', solver_build_time - program_start_time)
    LOGGER.info(' Model creation time : %.2f s', solver_start_time - solver_build_time)
    LOGGER.info('        Solving time : %.2f s', solver_end_time - solver_start_time)
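# A hypothetical example of the cleavage scores expected in 'input_cleavages'
# above (epitopes and scores are made up): one row per ordered pair of epitopes,
# scoring the junction between the 'from' and 'to' epitopes.
#
#   from,to,score
#   MGARASVLS,SLYNTVATL,-1.52
#   SLYNTVATL,MGARASVLS,-0.87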