def main(input_peptides, input_vaccine_sequences, output_vaccine_epitopes,
         verbose, log_file):
    ''' Converts the mosaics of a vaccine (fasta file) into a list of epitopes.

        Every mosaic is scanned with a sliding window of nine aminoacids; the
        windows that appear in the `peptide` column of the peptides csv are
        written to the output csv as rows of (cocktail, index, epitope), where
        cocktail is the zero-based position of the mosaic in the fasta file and
        index counts the recognized windows of that mosaic. Windows that are
        not in the peptides file are only counted for the log.
    '''
    global LOGGER
    LOGGER = utilities.init_logging(verbose, log_file, log_append=False)

    LOGGER.info('Reading peptides...')
    with open(input_peptides) as peptides_file:
        known_peptides = {
            row['peptide'] for row in csv.DictReader(peptides_file)
        }
    LOGGER.info('Read %d peptides', len(known_peptides))

    LOGGER.info('Reading vaccine...')
    mosaics = FileReader.read_fasta(input_vaccine_sequences, in_type=Protein)
    LOGGER.info('Vaccine has %d mosaic(s)', len(mosaics))

    with open(output_vaccine_epitopes, 'w') as epitopes_file:
        writer = csv.writer(epitopes_file)
        writer.writerow(('cocktail', 'index', 'epitope'))

        for mosaic_idx, mosaic in enumerate(mosaics):
            recognized = 0
            unknown = 0
            # one window per start position that leaves room for 9 aminoacids
            for start in range(len(mosaic) - 8):
                window = mosaic[start:start + 9]
                assert len(window) == 9

                if window not in known_peptides:
                    unknown += 1
                    continue

                writer.writerow((mosaic_idx, recognized, window))
                recognized += 1

            LOGGER.info('Mosaic %d - Recognized: %d Unknown %d',
                        mosaic_idx + 1, recognized, unknown)
Ejemplo n.º 2
0
def read_annotated_proteins(proteins_file):
    ''' Loads the proteins of a fasta file and shortens their identifiers.

        The transcript id of every protein is replaced by its last
        dot-separated field; the docstring of the original code states that
        this follows the header format of the HIV database.
    '''
    annotated = FileReader.read_fasta(proteins_file, in_type=Protein)
    for entry in annotated:
        # keep only what follows the last dot (the whole id if there is none)
        entry.transcript_id = entry.transcript_id.rsplit('.', 1)[-1]
    return annotated
Ejemplo n.º 3
0
def main():
    ''' Command line entry point: predicts epitopes and writes them to CSV.

        Parses the command line, builds the list of peptides (from a fasta
        file, a peptide file or peptide sequences) and the list of alleles
        (from a file or from the command line), runs every requested
        prediction method and sums the aligned result tables (missing entries
        count as 0). The summed table is written as CSV to --output (placed
        inside --outdir when given) and, with the hidden --html switch, also
        rendered as an HTML page next to the CSV file.

        With --available the registered prediction methods are printed
        together with their alleles and supported lengths, and the program
        exits with status 0.

        RefSeq and UniProt inputs are accepted by the parser but not
        implemented: they are rejected with a usage error.
    '''
    parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+
                                                 "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             +" or peptide sequences as sequences (max 50)"
                        )
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r","--refseq",
                             action="store_true",
                             help= "Specifies the input as RefSeq IDs")
    input_types.add_argument("-u","--uniprot",
                             action="store_true",
                             help= "Specifies the input as UniProt IDs")
    input_types.add_argument("-f","--fasta",
                             action="store_true",
                             help= "Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf","--pepfile",
                             action="store_true",
                             help= "Specifies the input as peptide file")
    input_types.add_argument("-p","--peptide",
                             action="store_true",
                             help= "Specifies the input as peptide sequences")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. " +
                             "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                               action="store_true",
                               help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                               action="store_true",
                               help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                       required=True,
                       nargs="+",
                       help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    #COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        # these registry entries are skipped when listing the methods
        skipped = ("AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction")
        # items() and single-argument print() behave the same on Python 2 and 3
        for pred, obj in AEpitopePrediction.registry.items():
            if pred not in skipped:
                # the attributes are name-mangled private members of the class
                print("Method:  %s" % pred)
                print("Supported Alleles:  %s" % " ".join(getattr(obj, "_"+pred+"__alleles")))
                print("Supported Length:  %s" % " ".join(map(str, getattr(obj, "_"+pred+"__supported_length"))))
                print("")
        sys.exit(0)

    # Parse input
    if args.refseq or args.uniprot:
        # these input types were never implemented; fail with a clear message
        # instead of a NameError on the undefined peptide list further down
        parser.error("RefSeq and UniProt inputs are not supported yet; "
                     "use --fasta, --pepfile or --peptide instead.")
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, type="Protein")
        peptides = generate_peptides_from_protein(proteins, args.length)
    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, type="Peptide")
    else:
        # --peptide: the group is required, so this is the only option left
        peptides = [Peptide(s) for s in args.input]

    # read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, type="Allele")
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    # one result table per method, summed after aligning rows and columns
    result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    # os.path.join works whether or not outdir ends with a path separator
    if args.outdir == "":
        output = args.output
    else:
        output = os.path.join(args.outdir, os.path.basename(args.output))
    with open(output, "w") as out:
        r_df.to_csv(out)

    #generate Galaxy HTML output
    if args.html:
        begin_html = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" />
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script>
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script>
    <script type="text/javascript" src="/static/scripts/libs/etk.js"></script>
</head>

<body>
    <div class="document">"""

        setting = """  <h2 class="etk-heading">Epitope Prediction Results</h2>

        <table class="etk-parameterT">
            <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr>
            <tr>
                <th>Prediction Method:</th>
                <td>%s</td>
            </tr>
        </table>""" % ", ".join(args.method)

        # The table is built from the summed data frame (r_df); `result` is
        # only the list of the remaining per-method frames.
        header_cells = "".join("<th>%s</th>" % str(a) for a in r_df.columns)
        # One table row per index entry, with one cell per column.
        # NOTE(review): r[0] assumes the index entries are tuples whose first
        # element is the peptide - confirm against the predictor output.
        body_rows = "".join(
            "<tr><td>%s</td>%s</tr>" % (
                r[0],
                "".join("<td align='right'>%s</td>" % str(r_df.loc[r, c])
                        for c in r_df.columns))
            for r in r_df.index)

        table = """

        <input id="etk-search" placeholder="  filter">
        <table class="etk-sortT etk-resultsT etk-filterT">

            <thead>
                <tr>
                    <th>Peptide</th>""" + header_cells + """
                </tr>
            </thead>""" + body_rows + "</table>"

        end_html = "</div></body></html>"

        # splitext only strips a real extension of the file name, so outputs
        # without extension or with dots in a directory name are handled
        html_out = os.path.splitext(output)[0] + ".html"
        with open(html_out, "w") as html_o:
            html_o.write(begin_html+setting+table+end_html)
Ejemplo n.º 4
0
 def test_read_fasta(self):
     """Check the number of sequences read from both fasta fixtures."""
     conventional = FileReader.read_fasta(self.fa_path)
     self.assertEqual(len(conventional), 2)

     # the headers of this fixture contain no "|"
     unconventional = FileReader.read_fasta(self.fa_unconventional_path)
     self.assertEqual(len(unconventional), 174)
def extract_peptides(input_sequences, max_edits, output_peptides, top_n):
    ''' Extract peptides from the given sequences and computes protein coverage for each peptide.
        Coverage can be computed allowing for inexact matching.

        In other words, it first generates all peptides (nine aminoacids long) that appear
        in the input proteins, and stores which proteins each peptide appears in. Then, for
        every peptide, it finds all peptides that can be obtained by changing at most
        max_edits aminoacids, and collects the proteins that contain the edited peptides.

        Parameters:
            input_sequences: fasta file with the proteins
            max_edits: maximum number of edits passed to the reachability search of the trie
            output_peptides: csv file to write, with columns `peptide` and `proteins`;
                proteins are the zero-based indices of the covered proteins in the
                order they were read, separated by semicolons
            top_n: if positive, only the top_n peptides covering the most proteins are
                saved (in heap order, not sorted); otherwise all peptides are saved
    '''
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    LOGGER.info('Extracting protein coverage for each peptide...')
    all_peptides = utilities.Trie()
    proteins_by_peptide = {}

    for i, prot in enumerate(proteins):
        # remove non-aminoacids from alignment
        aminoacids = ''.join(c for c in prot._data if c.isalpha())
        # make sure we only count peptides once per protein
        peptides_in_this_protein = set()
        for j in range(len(aminoacids) - 8):
            seq = aminoacids[j:j + 9]
            if seq not in peptides_in_this_protein:
                peptides_in_this_protein.add(seq)
                all_peptides.insert(seq)
                if seq not in proteins_by_peptide:
                    proteins_by_peptide[seq] = set()
                proteins_by_peptide[seq].add(i)

        if utilities.is_percent_barrier(i, len(proteins), 5):
            LOGGER.debug(
                '%d proteins analyzed (%.2f%%) and %d peptides extracted...',
                i + 1, 100 * (i + 1) / len(proteins), len(proteins_by_peptide))

    LOGGER.info('Computing reachability...')
    # min-heap of (coverage size, peptide, covered proteins); peptides are
    # unique, so ties on the size are always resolved by the peptide itself
    top_peptides = []
    with open(output_peptides, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('peptide', 'proteins'))

        for i, peptide in enumerate(proteins_by_peptide):
            # find reachable peptides and which proteins they belong to
            reachable_proteins = set()
            for reachable, _ in all_peptides.reachable_strings(
                    peptide, max_edits):
                reachable_proteins.update(proteins_by_peptide[reachable])

            # now either update the top N or save reachability to file
            if top_n > 0:
                heapq.heappush(
                    top_peptides,
                    (len(reachable_proteins), peptide, reachable_proteins))
                if len(top_peptides) > top_n:
                    # drop the peptide with the smallest coverage
                    heapq.heappop(top_peptides)
            else:
                writer.writerow(
                    (peptide, ';'.join(map(str, reachable_proteins))))

            if utilities.is_percent_barrier(i, len(proteins_by_peptide), 2.5):
                LOGGER.debug('%d peptides analyzed (%.2f%%)...', i + 1,
                             100 * (i + 1) / len(proteins_by_peptide))

        # save the top N to file
        if top_n > 0:
            LOGGER.info('Saving top peptides to file')
            for _, peptide, covered_proteins in top_peptides:
                # write the coverage stored with this peptide, using the same
                # semicolon separator as the rows written when top_n <= 0
                writer.writerow(
                    (peptide, ';'.join(map(str, covered_proteins))))
Ejemplo n.º 6
0
 def test_read_fasta(self):
     """Check the number of sequences read from both fasta fixtures."""
     regular_seqs = FileReader.read_fasta(self.fa_path)
     self.assertEqual(len(regular_seqs), 2)

     # the headers of this fixture contain no "|"
     irregular_seqs = FileReader.read_fasta(self.fa_unconventional_path)
     self.assertEqual(len(irregular_seqs), 174)
Ejemplo n.º 7
0
def vaccine(input_sequences, input_peptides, input_alleles, input_epitopes,
            input_vaccine, output_summary, verbose):
    ''' Evaluates a vaccine and writes a one-row csv summary of its statistics.

        The vaccine csv (columns cocktail, index, epitope) is grouped into
        mosaics, whose epitopes are ordered by index. Each mosaic is evaluated
        on its own with evaluate_epitopes, then all epitopes of the vaccine
        are evaluated together and the returned statistics are saved to
        output_summary. The `verbose` argument is not used here.
    '''
    # load vaccine: cocktail -> {index -> epitope}
    by_cocktail = {}
    with open(input_vaccine) as vaccine_file:
        for record in csv.DictReader(vaccine_file):
            mosaic_epitopes = by_cocktail.setdefault(record['cocktail'], {})
            mosaic_epitopes[int(record['index'])] = record['epitope']

    # epitopes of every mosaic, ordered by their index
    cocktail = [[by_index[position] for position in sorted(by_index)]
                for by_index in by_cocktail.values()]
    LOGGER.info('Vaccine loaded')

    # load alleles
    thresholds = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = thresholds.to_dict('index')
    LOGGER.info('Loaded %d alleles', len(allele_data))

    # load peptides coverage
    with open(input_peptides) as peptides_file:
        peptides = {
            record['peptide']: record['proteins'].split(';')
            for record in csv.DictReader(peptides_file)
        }
    LOGGER.info('Loaded %d peptides with coverage', len(peptides))

    # every peptide starts as a non-immunogenic epitope, since some design
    # methods do not use epitopes
    epitope_data = {}
    for peptide, covered in peptides.items():
        epitope_data[peptide] = {
            'immunogen': 0.0,
            'alleles': [],
            'proteins': covered,
        }

    # entries from the epitopes file replace the defaults, but only when
    # their immunogenicity is positive
    with open(input_epitopes) as epitopes_file:
        for record in csv.DictReader(epitopes_file):
            record['immunogen'] = float(record['immunogen'])
            if record['alleles']:
                record['alleles'] = record['alleles'].split(';')
            else:
                record['alleles'] = []
            record['proteins'] = record['proteins'].split(';')
            if record['immunogen'] > 0:
                epitope_data[record['epitope']] = record
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    # load sequences
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('Loaded %d proteins', len(proteins))

    # print stats for each mosaic
    for number, mosaic in enumerate(cocktail, start=1):
        LOGGER.info('---')
        LOGGER.info('Mosaic #%d - %d epitopes', number, len(mosaic))
        for epitope in mosaic:
            LOGGER.info('    %s', epitope)
        evaluate_epitopes(mosaic, epitope_data, allele_data, len(proteins))

    # stats of the whole vaccine, saved to csv
    LOGGER.info('---')
    all_epitopes = []
    for mosaic in cocktail:
        all_epitopes.extend(mosaic)
    vaccine_stats = evaluate_epitopes(all_epitopes, epitope_data, allele_data,
                                      len(proteins))
    with open(output_summary, 'w') as summary_file:
        writer = csv.DictWriter(summary_file, vaccine_stats.keys())
        writer.writeheader()
        writer.writerow(vaccine_stats)
Ejemplo n.º 8
0
def get_mosaic_solver_instance(logger, input_proteins, input_alleles,
                               input_epitopes, input_overlaps, **kwargs):
    ''' Loads the input data and creates the solver for the mosaic design.

        The keyword arguments top_immunogen, top_alleles and top_proteins are
        mandatory (a KeyError is raised when one is missing); min_overlap,
        cocktail, greedy_subtour, max_epitopes, max_aminoacids, min_alleles,
        min_proteins, min_avg_prot_conservation and min_avg_alle_conservation
        are optional.

        Returns a tuple with the TeamOrienteeringIlp instance and a dictionary
        with the loaded `proteins`, `alleles` and `epitope_data`.
    '''
    # mandatory settings
    top_immunogen = kwargs['top_immunogen']
    top_alleles = kwargs['top_alleles']
    top_proteins = kwargs['top_proteins']

    # optional settings
    min_overlap = kwargs.get('min_overlap', 0)
    num_mosaics = kwargs.get('cocktail', 1)
    greedy_subtour = kwargs.get('greedy_subtour')
    max_epitopes = kwargs.get('max_epitopes')
    max_aminoacids = kwargs.get('max_aminoacids')
    min_alleles = kwargs.get('min_alleles', 0)
    min_proteins = kwargs.get('min_proteins', 0)
    min_avg_prot_conservation = kwargs.get('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = kwargs.get('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    alleles = list(map(Allele, get_alleles_and_thresholds(input_alleles).index))
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    loaded_epitopes = load_epitopes(input_epitopes, top_immunogen,
                                    top_alleles, top_proteins)
    epitope_data = list(loaded_epitopes.values())
    num_epitopes = len(epitope_data)
    logger.info('Loaded %d epitopes', num_epitopes)

    # load edge cost; the reward list starts with an extra entry worth 0
    logger.info('Loading overlaps...')
    vertex_rewards = [0]
    vertex_rewards.extend(entry['immunogen'] for entry in epitope_data)
    epitope_sequences = [entry['epitope'] for entry in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap,
                                     epitope_sequences)
    logger.info('Kept %d edges (from %d)', len(edges),
                num_epitopes * (num_epitopes + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    coverage = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))
    type_coverage, min_type_coverage, min_avg_type_conservation = coverage

    # the solver is created with both limits at 0; they are updated below
    # only when numeric values were given
    solver = TeamOrienteeringIlp(
        num_teams=num_mosaics,
        vertex_reward=vertex_rewards,
        edge_cost=edges,
        max_edge_cost=0,
        max_vertices=0,
        lazy_subtour_elimination=not greedy_subtour,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
    )

    numeric = (int, float)
    if isinstance(max_epitopes, numeric):
        solver.update_max_vertices(max_epitopes)
    if isinstance(max_aminoacids, numeric):
        solver.update_max_edge_cost(max_aminoacids)

    loaded = {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
    return solver, loaded
Ejemplo n.º 9
0
def string_of_beads(input_proteins, input_alleles, input_epitopes,
                    input_cleavages, output_vaccine, cocktail, greedy_subtour,
                    max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    ''' Designs a string-of-beads vaccine and saves it to a csv file.

        Loads proteins, alleles, epitopes and pairwise cleavage scores, then
        builds and solves a TeamOrienteeringIlp whose vertices are the
        epitopes appearing in the cleavage file (plus an initial empty vertex
        with reward 0 and edges of cost 0). The chosen epitopes are written to
        output_vaccine as rows of (cocktail, index, epitope), and the time
        spent in each phase is logged.
    '''
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    alleles = [Allele(name) for name in allele_table.index]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores, remembering every epitope that has one
    cleavages = {}
    cleavage_epitopes = set()
    with open(input_cleavages) as cleavage_file:
        for record in csv.DictReader(cleavage_file):
            score = float(record['score'])
            cleavages[(record['from'], record['to'])] = score
            cleavage_epitopes.add(record['from'])
            cleavage_epitopes.add(record['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost; vertex 0 is the empty epitope
    vertex_to_epitope = [''] + list(cleavage_epitopes)
    vertices_rewards = []
    edge_cost = []
    for ep_from in vertex_to_epitope:
        if ep_from == '':
            reward = 0
        else:
            reward = epitopes[ep_from]['immunogen']
        vertices_rewards.append(reward)

        costs_from_here = []
        for ep_to in vertex_to_epitope:
            if ep_from == '' or ep_to == '':
                # edges touching the empty vertex cost nothing
                costs_from_here.append(0.0)
            else:
                costs_from_here.append(cleavages[(ep_from, ep_to)])
        edge_cost.append(costs_from_here)
    LOGGER.info('Kept %d epitopes with available clevages',
                len(vertex_to_epitope) - 1)

    coverage = utilities.compute_coverage_matrix(
        [epitopes[name] for name in vertex_to_epitope[1:]], min_alleles,
        min_proteins, min_avg_prot_conservation, min_avg_alle_conservation,
        len(proteins), len(alleles))
    type_coverage, min_type_coverage, min_avg_type_conservation = coverage

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertices_rewards,
        edge_cost=edge_cost,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids,
        max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()
    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as vaccine_file:
        writer = csv.writer(vaccine_file)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for mosaic_idx, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', mosaic_idx + 1)
            # the last element of the mosaic is skipped; the second item of
            # each remaining pair is used as the vertex number
            for position, (_, vertex) in enumerate(mosaic[:-1]):
                chosen = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((mosaic_idx, position, chosen['epitope']))
                LOGGER.info('    %s - IG: %.2f', chosen['epitope'],
                            chosen['immunogen'])

    total_time = solver_end_time - program_start_time
    preprocessing_time = solver_build_time - program_start_time
    model_creation_time = solver_start_time - solver_build_time
    solving_time = solver_end_time - solver_start_time

    LOGGER.info('==== Stopwatch')
    LOGGER.info('          Total time : %.2f s', total_time)
    LOGGER.info('      Pre-processing : %.2f s', preprocessing_time)
    LOGGER.info(' Model creation time : %.2f s', model_creation_time)
    LOGGER.info('        Solving time : %.2f s', solving_time)