Beispiel #1
0
class TestPeptide(TestCase):
    """Unit tests for ``Peptide`` objects built with and without protein/variant origin."""

    def setUp(self):
        """Create a bare peptide, a peptide tied to a protein and one tied to a variant protein."""
        self.simple = Peptide("SYFPEITHI")

        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", transcript_id="GLUC_HUMAN")
        gcg_p1 = Protein(self.gcg_ps,
                         transcript_id='GLUC_HUMAN',
                         orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {gcg_p1: [0]})
        self.gcg_p1 = gcg_p1
        self.gcg_v1 = Variant(
            "rs5650", VariationType.SNP, 2, 162145588, 'G', 'T', {
                "GLUC_HUMAN":
                MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
            }, False, False)
        # Work on a deep copy so that attaching the variant does not touch
        # the protein object referenced by self.w_p.
        gcg_p1_copy = copy.deepcopy(gcg_p1)
        gcg_p1_copy.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {gcg_p1_copy: [0]})

        # TODO: no transcript-level fixture (Peptide("TRANSCRIPT", ...)) is
        # set up here yet.

    def test_consistency(self):
        """
        Check ``repr`` of all three fixtures (this also exercises ``__init__``).

        The test has several asserts; if one fails, the following ones are
        not evaluated.
        """
        self.assertEqual(repr(self.simple), "PEPTIDE:\n SYFPEITHI")
        self.assertEqual(
            repr(self.w_p),
            "PEPTIDE:\n PROTEIN\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\nin PROTEIN: GLUC_HUMAN"
        )
        self.assertEqual(
            repr(self.w_v),
            "PEPTIDE:\n VARIANT\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\n\tVariant(g.162145588G>T)\nin PROTEIN: GLUC_HUMAN"
        )

    def test_getitem(self):
        """Slicing a peptide must compare equal to the sliced sequence string."""
        self.assertEqual(self.simple[1:3], 'YF')
        # TODO: document that variant peptides from a Protein with Variants
        # should be created with the Generator.

    def test_get_all_variants(self):
        """The variant attached to the copied protein is reported for GLUC_HUMAN."""
        self.assertEqual(
            repr(self.w_v.get_variants_by_protein("GLUC_HUMAN")),
            repr([self.gcg_v1]))

    def test_get_all_proteins(self):
        """A bare peptide has no proteins; the protein peptide reports its origin."""
        self.assertEqual(repr(self.simple.get_all_proteins()), repr([]))
        self.assertEqual(
            repr(self.w_p.get_all_proteins()), repr([self.gcg_p1]))

    def test_get_all_transcripts(self):
        """Both protein-backed peptides report the GLUC_HUMAN transcript."""
        self.assertEqual(
            repr(self.w_v.get_all_transcripts()),
            repr([Transcript(seq="", transcript_id="GLUC_HUMAN")]))
        self.assertEqual(
            repr(self.w_p.get_all_transcripts()), repr([self.gcg_t1]))
 def setUp(self):
     """Prepare peptide, allele, combined-allele and transcript fixtures."""
     mhc1_sequences = ("SYFPEITHI", "IHTIEPFYS")
     mhc2_sequences = ("AAAAAASYFPEITHI", "IHTIEPFYSAAAAAA")
     self.peptides_mhcI = [Peptide(seq) for seq in mhc1_sequences]
     self.peptides_mhcII = [Peptide(seq) for seq in mhc2_sequences]

     self.mhcI = [Allele(name) for name in ("HLA-B*07:02", "HLA-A*02:01")]
     self.mhcII = [
         Allele(name) for name in ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
     ]
     self.mhcII_combined_alleles = [
         CombinedAllele(name)
         for name in ("DPA1*01:03-DPB1*01:01", "DQA1*06:02-DQB1*06:31")
     ]

     # Transcript with an empty sequence.
     self.transcript = Transcript("")
Beispiel #3
0
 def setUp(self):
     """Prepare peptide and allele fixtures for MHC class I and II tests."""
     # Two 9-mers for the class I fixtures.
     mhc1_sequences = ("SYFPEITHI", "IHTIEPFYS")
     # One 9-mer and one 15-mer for the class II fixtures.
     mhc2_sequences = ("SYFPEITHI", "IHTIEPFYSAAAAAA")
     self.peptides_mhcI = [Peptide(seq) for seq in mhc1_sequences]
     self.peptides_mhcII = [Peptide(seq) for seq in mhc2_sequences]

     self.mhcI = [Allele(name) for name in ("HLA-B*15:01", "HLA-A*02:01")]
     self.mhcII = [
         Allele(name) for name in ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
     ]
Beispiel #4
0
    def test_simple_assembly(self):
        """
        Smoke test of the assembly with the PCM cleavage predictor and the cbc solver.

        The expected ordering was manually checked for optimality.

        :return:
        """
        cleavage_predictor = CleavageSitePredictorFactory("PCM")
        model = EpitopeAssembly(self.peptides,
                                cleavage_predictor,
                                solver="cbc",
                                verbosity=0)
        ordering = model.solve()

        expected = [
            Peptide(seq) for seq in ("YLYDHLAPM", "ALYDVVSTL", "KLLPRLPGV")
        ]
        self.assertEqual(ordering, expected)
Beispiel #5
0
    def setUp(self):
        """Create a bare peptide, a protein-backed peptide and a variant-backed peptide."""
        self.simple = Peptide("SYFPEITHI")

        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", _transcript_id="GLUC_HUMAN")

        protein = Protein(self.gcg_ps,
                          _transcript_id='GLUC_HUMAN',
                          _orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {protein: [0]})
        self.gcg_p1 = protein

        coding = {
            "GLUC_HUMAN":
            MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
        }
        self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588,
                              'G', 'T', coding, False, False)

        # The variant goes onto a deep copy, leaving self.gcg_p1 untouched.
        variant_protein = copy.deepcopy(protein)
        variant_protein.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {variant_protein: [0]})
def compute_affinities(input_alleles, input_peptides, output_affinities,
                       processes, predictor):
    """Compute binding affinities between peptides and HLA alleles and write them as CSV.

    Alleles come from the index returned by
    ``utilities.get_alleles_and_thresholds(input_alleles)`` (with any ``HLA-``
    substring removed); peptides come from the ``peptide`` column of the CSV
    at ``input_peptides``. Peptides are processed in batches of 256, ordered
    by decreasing number of ``;``-separated entries in their ``proteins``
    column. Each result chunk is written to ``output_affinities``: the file
    is opened in write mode with a header while nothing has been counted yet
    and in append mode without header afterwards.
    """
    allele_names = utilities.get_alleles_and_thresholds(input_alleles).index
    alleles = [Allele(name.replace('HLA-', '')) for name in allele_names]
    LOGGER.info('Loaded %d alleles', len(alleles))

    peptides = []
    with open(input_peptides) as handle:
        for row in csv.DictReader(handle):
            peptide = Peptide(row['peptide'])
            protein_count = len(row['proteins'].split(';'))
            peptides.append((peptide, protein_count))

    # Peptides occurring in the most proteins go first.
    peptides.sort(key=lambda entry: entry[1], reverse=True)
    LOGGER.info('Loaded %d peptides', len(peptides))

    peptide_stream = (peptide for peptide, _ in peptides)
    tasks = ((predictor.lower(), batch, alleles)
             for batch in utilities.batches(peptide_stream, bsize=256))
    results = utilities.parallel_apply(get_binding_affinity_process, tasks,
                                       processes)

    processed = 0
    for chunk in results:
        nothing_written_yet = (processed == 0)
        write_mode = 'w' if nothing_written_yet else 'a'
        chunk.to_csv(output_affinities,
                     header=nothing_written_yet,
                     mode=write_mode)
        processed += len(chunk)
        LOGGER.debug('Processed %d peptides (%.2f%%)...', processed,
                     100 * processed / len(peptides))
Beispiel #7
0
class TestPeptide(TestCase):
    """Unit tests for ``Peptide`` objects built with and without protein/variant origin."""

    def setUp(self):
        """Create a bare peptide, a peptide tied to a protein and one tied to a variant protein."""
        self.simple = Peptide("SYFPEITHI")

        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", _transcript_id="GLUC_HUMAN")
        gcg_p1 = Protein(self.gcg_ps, _transcript_id='GLUC_HUMAN', _orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {gcg_p1: [0]})
        self.gcg_p1 = gcg_p1
        self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G', 'T',
                              {"GLUC_HUMAN": MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")},
                              False, False)
        # Work on a deep copy so that attaching the variant does not touch
        # the protein object referenced by self.w_p.
        gcg_p1_copy = copy.deepcopy(gcg_p1)
        gcg_p1_copy.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {gcg_p1_copy: [0]})

        # TODO: no transcript-level fixture (Peptide("TRANSCRIPT", ...)) is
        # set up here yet.

    def test_consistency(self):
        """
        Check ``repr`` of all three fixtures (this also exercises ``__init__``).

        The test has several asserts; if one fails, the following ones are
        not evaluated.
        """
        self.assertEqual(repr(self.simple), "PEPTIDE:\n SYFPEITHI")
        self.assertEqual(
            repr(self.w_p),
            "PEPTIDE:\n PROTEIN\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\nin PROTEIN: GLUC_HUMAN")
        self.assertEqual(
            repr(self.w_v),
            "PEPTIDE:\n VARIANT\nin TRANSCRIPT: GLUC_HUMAN\n\tVARIANTS:\n\tVariant(g.162145588G>T)\nin PROTEIN: GLUC_HUMAN")

    def test_getitem(self):
        """Slicing a peptide must compare equal to the sliced sequence string."""
        self.assertEqual(self.simple[1:3], 'YF')
        # TODO: document that variant peptides from a Protein with Variants
        # should be created with the Generator.

    def test_get_all_variants(self):
        """The variant attached to the copied protein is reported for GLUC_HUMAN."""
        self.assertEqual(repr(self.w_v.get_variants_by_protein("GLUC_HUMAN")), repr([self.gcg_v1]))

    def test_get_all_proteins(self):
        """A bare peptide has no proteins; the protein peptide reports its origin."""
        self.assertEqual(repr(self.simple.get_all_proteins()), repr([]))
        self.assertEqual(repr(self.w_p.get_all_proteins()), repr([self.gcg_p1]))

    def test_get_all_transcripts(self):
        """Both protein-backed peptides report the GLUC_HUMAN transcript."""
        self.assertEqual(repr(self.w_v.get_all_transcripts()),
                         repr([Transcript(_seq="", _transcript_id="GLUC_HUMAN")]))
        self.assertEqual(repr(self.w_p.get_all_transcripts()), repr([self.gcg_t1]))
Beispiel #8
0
def read_lines(file):
    """Read peptides from a text file, one per line.

    Lines that are blank or that start with ``#``, ``Epitope`` or
    ``Sequence`` are skipped. For every other line the first
    whitespace-separated token is wrapped in a ``Peptide``.

    :param file: path of the file to read
    :return: list of ``Peptide`` objects in file order
    """
    skipped_prefixes = ("#", "Epitope", "Sequence")
    collected = []

    with open(file, "r") as handle:
        for line in handle:
            if line.startswith(skipped_prefixes) or line.strip() == "":
                continue
            first_token = line.split()[0].strip()
            collected.append(Peptide(first_token))
    return collected
def optitope(input_affinities, input_peptides, input_alleles, output_vaccine,
             epitopes, min_alleles, min_proteins):
    """Select a vaccine with OptiTope and write it to ``output_vaccine`` as CSV.

    Peptides and their ``;``-separated protein ids are read from
    ``input_peptides``; alleles with thresholds come from
    ``utilities.get_alleles_and_thresholds(input_alleles)``; affinities are
    loaded with ``utilities.affinities_from_csv``. The model selects
    ``epitopes`` epitopes with the gurobi solver. Allele and antigen coverage
    constraints are only activated when ``min_alleles`` / ``min_proteins``
    are greater than zero. The output has the columns cocktail, index and
    epitope; per-epitope and total immunogenicity are logged.
    """
    peptides = {}
    with open(input_peptides) as handle:
        for row in csv.DictReader(handle):
            # The real protein sequence is not needed; the id is used as the
            # sequence so that every protein is unique.
            peptides[Peptide(row['peptide'])] = set(
                Protein(gid, gene_id=gid)
                for gid in row['proteins'].split(';'))
    LOGGER.info('Loaded %d peptides', len(peptides))

    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = allele_table.to_dict('index')
    thresholds = {}
    for name, info in allele_data.items():
        thresholds[name.replace('HLA-', '')] = info['threshold']
    LOGGER.info('Loaded %d alleles', len(thresholds))

    affinities = utilities.affinities_from_csv(input_affinities,
                                               allele_data,
                                               peptide_coverage=peptides)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info("Creating vaccine...")
    model = OptiTope(affinities, thresholds, k=epitopes, solver='gurobi')
    if min_alleles > 0:
        model.activate_allele_coverage_const(min_alleles)
        LOGGER.info('Vaccine will cover at least %f alleles', min_alleles)
    if min_proteins > 0:
        model.activate_antigen_coverage_const(min_proteins)
        LOGGER.info('Vaccine will cover at least %f proteins', min_proteins)
    vaccine = model.solve()

    LOGGER.info('Vaccine summary:')
    with open(output_vaccine, 'w') as out:
        rows = csv.writer(out)
        rows.writerow(('cocktail', 'index', 'epitope'))
        total_ig = 0.0
        instance = model.instance
        for position, epitope in enumerate(vaccine):
            rows.writerow((0, position, epitope))
            # Immunogenicity of one epitope: sum over alleles of p[a] * i[epitope, a].
            epitope_ig = sum(instance.p[a] * instance.i[epitope, a]
                             for a in instance.A)
            total_ig += epitope_ig
            LOGGER.info('    %s - %.2f', epitope, epitope_ig)
        LOGGER.info('Total immunogenicity: %.2f', total_ig)
def get_cleavage_score_process(penalty, cleavage_model, window_size, epitopes):
    """Score the junction of every epitope pair with the PCM cleavage predictor.

    Only the ``pcm`` model (case-insensitive) is accepted. For each
    ``(ep_from, ep_to)`` pair the concatenated sequence is predicted and the
    values at positions within ``int((window_size - 1) / 2)`` of the junction
    (the last residue of ``ep_from``) are combined: the junction position is
    weighted with -1, every other position in the window with ``penalty``.

    :return: list of ``(ep_from, ep_to, score)`` tuples in input order
    """
    assert cleavage_model.lower() == 'pcm'
    from Fred2.CleavagePrediction import PCM
    predictor = PCM()

    def junction_score(left, right):
        # Weighted sum of the predictions in the window around the junction.
        predictions = predictor.predict(Peptide(left + right))
        total = 0.0
        junction = len(left) - 1
        reach = int((window_size - 1) / 2)
        for position, (_, likelihood) in enumerate(predictions.values):
            if abs(position - junction) > reach:
                continue
            if position == junction:
                factor = -1
            else:
                factor = penalty
            total += factor * likelihood
        return total

    return [(left, right, junction_score(left, right))
            for left, right in epitopes]
def append_score(dt2, alleles):
    """
    Given a chopped sequence table (output from sliding_window()),
    append the immunogenicity scores.

    The unique sequences of the "MT" and "WT" columns are predicted once.
    The predictions are left-joined on "WT" (score column renamed to
    "WT_score") and then on "MT", "method" and "allele" (score column
    renamed to "MT_score"). The helper "peptide" column is removed after
    each join.
    """
    unique_sequences = set(list(dt2["MT"]) + list(dt2["WT"]))
    peptides_to_compute = [Peptide(seq) for seq in unique_sequences]

    res = fred2wrap.predict_peptide_effects(peptides_to_compute, alleles)
    # Plain strings are needed as join keys.
    res["peptide"] = [str(entry) for entry in res["peptide"]]

    # Wild-type scores.
    with_wt = pd.merge(dt2, res, how='left', left_on="WT", right_on="peptide")
    with_wt = with_wt.rename(columns={'score': 'WT_score'})
    del with_wt["peptide"]

    # Mutant scores, matched on method and allele as well.
    shared_keys = ["method", "allele"]
    with_both = pd.merge(with_wt,
                         res,
                         how='left',
                         left_on=["MT"] + shared_keys,
                         right_on=["peptide"] + shared_keys)
    with_both = with_both.rename(columns={'score': 'MT_score'})
    del with_both["peptide"]
    return with_both
Beispiel #12
0
def generate_epitope_result(input, allele_file):
    """
    Generate an EpitopePredictionResult from the output of epitopeprediction
    and neoepitopeprediction.

    :param input: path of a tab-separated prediction table. The columns
                  "Sequence" and "Method" are read; "Antigen ID" is used when
                  present (comma-separated ids); every column other than
                  "Sequence", "Method", "Antigen ID" and "Variant" is treated
                  as an allele column.
    :param allele_file: path of a tab-separated file with one
                        ``allele<TAB>frequency`` pair per line. Blank lines
                        are ignored.
    :return: tuple ``(df_result, method)`` where ``method`` is the "Method"
             value of the first table row and ``df_result`` is indexed by
             ('Seq', 'Method').

    Writes a message to stderr and exits with status -1 when none of the
    allele columns is contained in the allele file.
    """
    # First generate the alleles listed in the allele file.
    alleles = {}
    with open(allele_file, "r") as af:
        for line in af:
            # A blank (e.g. trailing) line cannot be split into two fields.
            if not line.strip():
                continue
            allele, freq = line.split("\t")
            alleles[allele] = Allele(allele, prob=float(freq))

    r_raw = pandas.read_csv(input, sep="\t")
    res_dic = {}
    method = r_raw.loc[0, "Method"]
    columns = {"Sequence", "Method", "Antigen ID", "Variant"}
    alleles_raw = [c for c in r_raw.columns if c not in columns]
    for _, row in r_raw.iterrows():
        seq = row["Sequence"]
        protPos = collections.defaultdict(list)
        try:
            protPos = {Protein(p, gene_id=p, transcript_id=p): [0] for p in str(row["Antigen ID"]).split(",")}
        except KeyError:
            # No "Antigen ID" column: keep the empty default mapping.
            pass
        pep = Peptide(seq, protein_pos=protPos)
        for a in alleles_raw:
            if a in alleles:
                res_dic.setdefault(alleles[a], {})[pep] = float(row[a])

    if not res_dic:
        sys.stderr.write("HLA alleles of population and HLA used for prediction did not overlap.")
        sys.exit(-1)

    df_result = EpitopePredictionResult.from_dict(res_dic)
    df_result.index = pandas.MultiIndex.from_tuples([(i, method) for i in df_result.index],
                                                    names=['Seq', 'Method'])
    return df_result, method
Beispiel #13
0
    def setUp(self):
        """Create a bare peptide, a protein-backed peptide and a variant-backed peptide."""
        self.simple = Peptide("SYFPEITHI")

        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", transcript_id="GLUC_HUMAN")

        protein = Protein(self.gcg_ps,
                          transcript_id='GLUC_HUMAN',
                          orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {protein: [0]})
        self.gcg_p1 = protein

        coding = {
            "GLUC_HUMAN":
            MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
        }
        self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588,
                              'G', 'T', coding, False, False)

        # The variant goes onto a deep copy, leaving self.gcg_p1 untouched.
        variant_protein = copy.deepcopy(protein)
        variant_protein.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {variant_protein: [0]})
def popcover(input_peptides, input_affinities, input_alleles, output_vaccine,
             processes, epitopes):
    """Select a vaccine with PopCover and write it to ``output_vaccine`` as CSV.

    Peptides and their ``;``-separated protein ids are read from
    ``input_peptides``; alleles with thresholds come from
    ``utilities.get_alleles_and_thresholds(input_alleles)``; affinities are
    loaded with ``utilities.affinities_from_csv``. A positive ``processes``
    value is passed on unchanged; zero or a negative value is added to the
    CPU count. The output has the columns cocktail, index and epitope, and
    every selected epitope is logged.
    """
    peptides = {}
    with open(input_peptides) as handle:
        for row in csv.DictReader(handle):
            peptides[Peptide(row['peptide'])] = set(row['proteins'].split(';'))
    LOGGER.info('Loaded %d peptides', len(peptides))

    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = allele_table.to_dict('index')
    thresholds = {}
    for name, info in allele_data.items():
        thresholds[name.replace('HLA-', '')] = info['threshold']
    LOGGER.info('Loaded %d alleles', len(thresholds))

    affinities = utilities.affinities_from_csv(input_affinities, allele_data,
                                               peptides)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info("Creating vaccine...")
    if processes > 0:
        worker_count = processes
    else:
        worker_count = mp.cpu_count() + processes
    model = PopCover(affinities,
                     thresholds,
                     k=epitopes,
                     processes=worker_count)
    vaccine = model.solve()

    with open(output_vaccine, 'w') as out:
        rows = csv.writer(out)
        rows.writerow(('cocktail', 'index', 'epitope'))
        for position, epitope in enumerate(vaccine):
            rows.writerow((0, position, epitope))
            LOGGER.info('    %s', epitope)
def read_epitope_input(args, alleles, exclude):
    """
    Read epitope files generated by NGSAnalyzer+ImmunogenicityPredictor.

    Header NGSAnalyzer:
        mutation - position of the mutation in the reference genome (currently hg19); format:
                   chromosome_position; the position is zero-based
        gene - gene affected by the mutation
        transcript - transcript affected by the mutation (UCSC known genes transcript ID)
        transcript_expression - expression in RPKM/FPKM of the affected transcript
        neopeptide - peptide resulting from the mutation in the given transcript
        length_of_neopeptide - length of the neopeptide
        HLA - HLA used for the binding prediction of the neopeptide
        HLA_class1_binding_prediction - predicted binding affinity (currently rank score of IEDB consensus tool)

    Header ImmunogenicityPredictor:
        immunogenicity - predicted immunogenicity for specific HLA allele in column
        distance - distance-to-self estimation specific for HLA allele in column
        [uncertainty] - If immunopredictor can estimate prediction uncertainty

    Rows with an empty neopeptide, an excluded neopeptide or an HLA that is
    not among ``alleles`` are skipped (the latter with a warning).

    :param args: Input arguments; the attributes ``input``, ``rank``,
                 ``immunogenicity``, ``distance``, ``uncertainty`` and ``taa``
                 are read
    :param alleles: HLA alleles
    :param exclude: excluded peptide sequences, or None
    :return: df_epitope - EpitopePredictionResult indexed by ('Seq', 'Method')
             distance - dict((pep_string, allele_name), float)
             expression - dict(gene_id, float), maximum over all rows of a gene
             uncertainty - dict((pep_string, allele_name), float)
             pep_to_mutation - dict(pep_string, list(mutation_string))
    """
    distance = {}
    uncertainty = {}
    expression = {}
    seq_to_pep = {}
    gene_to_prot = {}
    hla_to_allele = {a.name: a for a in alleles}
    df_pred = {a: {} for a in alleles}
    pep_to_mutation = {}

    # NOTE(review): the 'U' flag of open() was removed in Python 3.11; it is
    # kept because this module appears to target Python 2 - confirm before
    # porting.
    with open(args.input, "rU") as f:
        reader = csv.DictReader(f, delimiter='\t')

        for row in reader:
            seq = row["neopeptide"]
            # Skip empty/missing and explicitly excluded peptides.
            if not seq or (exclude is not None and seq in exclude):
                continue

            # One Peptide object per distinct sequence.
            pep = seq_to_pep.get(seq)
            if pep is None:
                pep = Peptide(seq.upper())
                seq_to_pep[seq] = pep

            try:
                allele = hla_to_allele[row["HLA"].replace("HLA-", "")]
            except (KeyError, AttributeError):
                # KeyError: unknown allele; AttributeError: HLA field missing
                # in this row (DictReader fills short rows with None).
                logging.warning(
                    "HLA {allele} was not contained in the provided allele file. Please check your input."
                    .format(allele=row["HLA"]))
                continue

            # One placeholder Protein (empty sequence) per gene.
            gene = row["gene"]
            prot = gene_to_prot.get(gene)
            if prot is None:
                prot = Protein("", gene_id=gene, transcript_id=gene)
                gene_to_prot[gene] = prot

            pep.proteins[prot.transcript_id] = prot
            pep.proteinPos[prot.transcript_id].append(0)
            pep_to_mutation.setdefault(seq, []).append(row["mutation"])
            expression.setdefault(gene, []).append(
                float(row["transcript_expression"]))

            raw_value = float(row[args.immunogenicity])
            if args.rank:
                # Rank scores are percentages: map to [0, 1], higher is better.
                score = max(0., 1. - raw_value / 100.0)
            elif args.immunogenicity == "HLA_class1_binding_prediction":
                # Log-transform with base 50000, clipped at 0.
                score = max(0., 1. - math.log(raw_value, 50000))
            else:
                score = raw_value
            df_pred[allele][pep] = score

            if args.distance is not None:
                distance[(seq, allele.name)] = float(row[args.distance])

            if args.uncertainty is not None:
                uncertainty[(seq, allele.name)] = float(row[args.uncertainty])

            if args.taa is not None:
                pep.log_metadata("taa", row[args.taa].upper() == "TAA")

    # items() works on both Python 2 and 3, unlike iteritems().
    expression = {k: max(v) for k, v in expression.items()}
    df_result = EpitopePredictionResult.from_dict(df_pred)
    df_result.index = pandas.MultiIndex.from_tuples(
        [(i, "custom") for i in df_result.index],
        names=['Seq', 'Method'])

    return df_result, distance, expression, uncertainty, pep_to_mutation
def toplevel_predictor(x):
    """Predict the sequences in ``x`` with netMHC version 3.4 and return the result."""
    netmhc = EpitopePredictorFactory("netMHC", version="3.4")
    return netmhc.predict([Peptide(sequence) for sequence in x])
Beispiel #17
0
 def setUp(self):
     """Provide three 9-mer peptides as the shared test input."""
     sequences = ("KLLPRLPGV", "YLYDHLAPM", "ALYDVVSTL")
     self.peptides = [Peptide(sequence) for sequence in sequences]
Beispiel #18
0
    def solve(self, start=0, threads=None, options=None):
        """
        Solve the epitope assembly problem with spacers optimally using integer linear programming.

        For every ordered pair of distinct epitopes and every spacer length
        from ``start`` to ``self.__k`` a spacer design is computed in a worker
        pool. Per pair the design with the smallest ``-min(c1, c2)`` is kept;
        these values are passed as edge weights to ``EpitopeAssembly``, which
        orders the epitopes. The chosen spacers are stored in ``self.spacer``.

        .. note::

            This can take quite long and should not be done for more than 30 epitopes max!
            Also, one has to disable pre-solving steps in order to use this model.

        :param int start: Start length for spacers (default 0).
        :param int threads: Number of threads used for spacer design (default: CPU count).
                            Be careful, if options contain solver threads it will allocate threads*solver_threads cores!
        :param dict(str,str) options: Solver specific options as keys and parameters as values
        :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide`
                 (epitope, spacer, epitope, ...)
        :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
        :raises ValueError: If no scoring matrix could be loaded for any allele
        """
        def __load_model(name, model):
            # Import Fred2.Data.pssms.<name>.mat.<model> and return its
            # attribute called <model>.
            return getattr(
                __import__("Fred2.Data.pssms." + name + ".mat." + model,
                           fromlist=[model]), model)

        options = dict() if options is None else options
        threads = mp.cpu_count() if threads is None else threads

        #prepare parameters
        cn = min(self.__clev_pred.supportedLength)
        cl_pssm = __load_model(self.__clev_pred.name,
                               self.__clev_pred.name + "_" + str(cn))
        cleav_pos = self.__clev_pred.cleavagePos
        en = self.__en
        epi_pssms = {}
        allele_prob = {}
        delete_alleles = []
        log_scaled = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]
        if log_scaled:
            # NOTE(review): this replaces self.__thresh on every call, so a
            # second call to solve() would transform the thresholds again -
            # confirm that solve() is only called once per instance.
            self.__thresh = {
                k: (1 - math.log(v, 50000) if v != 0 else 0)
                for k, v in self.__thresh.items()
            }
        for a in self.__alleles:
            allele_prob[a.name] = a.prob
            try:
                pssm = __load_model(
                    self.__epi_pred.name,
                    "%s_%i" % (self.__epi_pred.convert_alleles([a])[0], en))
                if log_scaled:
                    for j, v in pssm.items():
                        for aa, score in v.items():
                            epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                                math.pow(10, score), 50000)
                else:
                    for j, v in pssm.items():
                        for aa, score in v.items():
                            epi_pssms[j, aa, a.name] = score
            except ImportError:
                delete_alleles.append(a)

        #delete alleles from model that generated an error while loading matrices
        for a in delete_alleles:
            del allele_prob[a.name]
            del self.__thresh[a.name]

        if not epi_pssms:
            raise ValueError(
                "Selected alleles with epitope length are not supported by the prediction method."
            )

        # Run the spacer designs in parallel. The pool is created only after
        # all inputs have been validated and is always shut down again, so no
        # worker processes are left behind when an error occurs.
        pool = mp.Pool(threads)
        try:
            res = pool.map(
                _runs_lexmin,
                ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
                  allele_prob, self.__alpha, self.__thresh, self.__solver,
                  self.__beta, options) for i in range(start, self.__k + 1)
                 for ei, ej in itr.product(self.__peptides, repeat=2)
                 if ei != ej))
        finally:
            pool.close()
            pool.join()

        # Find the best scoring spacer for each epitope pair.
        opt_spacer = {}
        adj_matrix = {}
        inf = float("inf")
        for ei, ej, _score, _epi, spacer, c1, c2, _non_c in res:
            weight = -min(c1, c2)
            if adj_matrix.get((ei, ej), inf) > weight:
                adj_matrix[(ei, ej)] = weight
                opt_spacer[(ei, ej)] = spacer

        self.spacer = opt_spacer
        # Solve the assembly with the generated adjacency matrix.
        assembler = EpitopeAssembly(self.__peptides,
                                    self.__clev_pred,
                                    solver=self.__solver,
                                    matrix=adj_matrix)
        res = assembler.solve(options=options)

        #generate output: epitope, spacer, epitope, spacer, ...
        ordered = [str(p) for p in res]
        sob = []
        for pos, (ei, ej) in enumerate(zip(ordered, ordered[1:])):
            if pos == 0:
                sob.append(Peptide(ei))
            sob.append(Peptide(opt_spacer[ei, ej]))
            sob.append(Peptide(ej))
        return sob
Beispiel #19
0
    def __init__(self,
                 peptides,
                 pred,
                 solver="glpk",
                 weight=0.0,
                 matrix=None,
                 verbosity=0):
        """
        Build the integer linear program that orders the given peptides.

        :param peptides: list of peptides to order (copied with ``[:]``)
        :param pred: cleavage site predictor; must be an
                     ``ACleavageSitePrediction`` instance
        :param str solver: solver name passed to ``SolverFactory``
        :param float weight: factor applied to the summed predictions at the
                             positions next to the junction when computing
                             edge weights
        :param matrix: optional precomputed edge weights keyed by
                       ``(str, str)``; when given, no cleavage prediction is
                       run and zero-weight "Dummy" edges are added to this
                       dict in place
        :param int verbosity: values > 0 print the model instance
        :raises ValueError: if ``pred`` is not an ``ACleavageSitePrediction``

        A warning is issued when more than 60 peptides are given.
        """

        if not isinstance(pred, ACleavageSitePrediction):
            raise ValueError(
                "Cleave site predictor must be of type ACleavageSitePrediction"
            )

        if len(peptides) > 60:
            warnings.warn(
                "The peptide set exceeds 60. Above this level one has to expect "
                +
                "considerably long running times due to the complexity of the problem."
            )

        #Generate model
        #1. Generate peptides for which cleave sites have to be predicted
        #2. generate graph with dummy element
        self.__verbosity = verbosity

        # "Dummy" is an extra node connected to every peptide with weight 0.
        pep_tmp = peptides[:]
        pep_tmp.append("Dummy")
        edge_matrix = {}
        fragments = {}
        seq_to_pep = {}
        self.neo_cleavage = {}
        self.good_cleavage = {}

        if matrix is None:
            for start, stop in itr.combinations(pep_tmp, 2):
                if start == "Dummy" or stop == "Dummy":
                    seq_to_pep[str(start)] = start
                    seq_to_pep[str(stop)] = stop
                    edge_matrix[(str(start), str(stop))] = 0
                    edge_matrix[(str(stop), str(start))] = 0
                else:
                    # Both concatenation orders of the pair are predicted.
                    start_str = str(start)
                    stop_str = str(stop)
                    frag = Peptide(start_str + stop_str)
                    garf = Peptide(stop_str + start_str)

                    fragments[frag] = (start_str, stop_str)
                    fragments[garf] = (stop_str, start_str)

            cleave_pred = pred.predict(list(fragments.keys()))
            #cleave_site_df = cleave_pred.xs((slice(None), (cleavage_pos-1)))
            for i in set(cleave_pred.index.get_level_values(0)):
                # NOTE(review): DataFrame.ix was removed in pandas 1.0 -
                # confirm which pandas version this code targets.
                fragment = "".join(cleave_pred.ix[i]["Seq"])
                # NOTE(review): `fragment` is a str while the keys of
                # `fragments` are Peptide objects; presumably Peptide hashes
                # and compares like its sequence string - verify.
                start, stop = fragments[fragment]

                # Junction = last residue of the first peptide.
                cleav_pos = len(str(start)) - 1
                # Edge weight: negated (junction prediction minus `weight`
                # times the summed predictions at cleav_pos-1 and
                # cleav_pos+1..cleav_pos+3).
                edge_matrix[(start, stop)] = -1.0 * (
                    cleave_pred.loc[(i, len(str(start)) - 1), pred.name] -
                    weight * sum(cleave_pred.loc[(i, j), pred.name]
                                 for j in range(cleav_pos - 1, cleav_pos +
                                                4, 1) if j != cleav_pos))

                # Summed predictions around (but not at) the junction.
                self.neo_cleavage[(start, stop)] = sum(
                    cleave_pred.loc[(i, j), pred.name]
                    for j in range(cleav_pos - 1, cleav_pos + 4, 1)
                    if j != cleav_pos)
                # Prediction at the junction itself.
                self.good_cleavage[(start,
                                    stop)] = cleave_pred.loc[(i,
                                                              len(str(start)) -
                                                              1), pred.name]
        else:
            # Use the caller's weights; only the Dummy edges are added.
            edge_matrix = matrix
            seq_to_pep = {str(p): p for p in pep_tmp}
            for p in seq_to_pep.keys():
                if p != "Dummy":
                    edge_matrix[(p, "Dummy")] = 0
                    edge_matrix[("Dummy", p)] = 0
        self.__seq_to_pep = seq_to_pep

        #3. initialize ILP
        self.__solver = SolverFactory(solver)
        model = ConcreteModel()

        # E: peptide sequences; E_prime: E plus "Dummy"; ExE: ordered pairs of E.
        E = [x for x in list(seq_to_pep.keys()) if x != "Dummy"]
        model.E = Set(initialize=E)
        model.E_prime = Set(initialize=list(seq_to_pep.keys()))
        model.ExE = Set(initialize=itr.permutations(E, 2), dimen=2)

        model.w_ab = Param(model.E_prime,
                           model.E_prime,
                           initialize=edge_matrix)
        model.card = Param(initialize=len(model.E_prime))

        # x[a, b] is binary; u[a] is an integer between 2 and card.
        model.x = Var(model.E_prime, model.E_prime, within=Binary)
        model.u = Var(model.E, domain=PositiveIntegers, bounds=(2, model.card))

        # NOTE(review): the lambda parameter is named `mode`, so the body
        # uses the enclosing `model` variable rather than its argument.
        model.obj = Objective(
            rule=lambda mode: sum(model.w_ab[a, b] * model.x[a, b]
                                  for a in model.E_prime for b in model.E_prime
                                  if a != b),
            sense=minimize)

        # Every node has exactly one outgoing and one incoming selected edge.
        model.tour_constraint_1 = Constraint(
            model.E_prime,
            rule=lambda model, a: sum(model.x[a, b] for b in model.E_prime
                                      if a != b) == 1)
        model.tour_constraint_2 = Constraint(
            model.E_prime,
            rule=lambda model, a: sum(model.x[b, a] for b in model.E_prime
                                      if a != b) == 1)
        # NOTE(review): looks like a Miller-Tucker-Zemlin style subtour
        # elimination over the non-dummy nodes - confirm.
        model.cardinality_constraint = Constraint(
            model.ExE,
            rule=lambda model, a, b: model.u[a] - model.u[b] + 1 <=
            (model.card - 1) * (1 - model.x[a, b]))

        self.instance = model
        if self.__verbosity > 0:
            print("MODEL INSTANCE")
            self.instance.pprint()
0
    def approximate(self, start=0, threads=1, options=None):
        """
        Approximates the Epitope Assembly problem by applying Lin-Kernighan traveling salesman heuristic

        LKH implementation must be downloaded, compiled, and globally executable.

        Source code can be found here:
        http://www.akira.ruc.dk/~keld/research/LKH/

        For every ordered pair of distinct epitopes and every spacer length in
        ``start..k`` a spacer is designed in a worker process. The best spacer per
        pair defines the edge weight of the adjacency matrix that is handed to
        :class:`EpitopeAssembly` for ordering. The chosen spacers are also stored
        in ``self.spacer``, keyed by ``(left_epitope, right_epitope)`` strings.

        :param int start: Start length for spacers (default 0).
        :param int threads: Number of worker processes used for spacer design (``None`` uses all logical
                            CPUs). Be careful, if options contain solver threads it
                            will allocate threads*solver_threads cores!
        :param dict(str,str) options: Solver specific options (threads for example)
        :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide`, alternating epitope and spacer
        :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
        :raises ValueError: If no epitope prediction matrix could be loaded for any of the alleles
        """
        def __load_model(name, model):
            # Import the PSSM module by its dotted path and return the matrix object of the same name.
            return getattr(
                __import__("Fred2.Data.pssms." + name + ".mat." + model,
                           fromlist=[model]), model)

        options = dict() if options is None else options
        threads = mp.cpu_count() if threads is None else threads

        # prepare parameters
        cn = min(self.__clev_pred.supportedLength)
        cl_pssm = __load_model(self.__clev_pred.name,
                               self.__clev_pred.name + "_" + str(cn))
        cleav_pos = self.__clev_pred.cleavagePos
        en = self.__en
        epi_pssms = {}
        allele_prob = {}
        delete_alleles = []

        # These predictors get their thresholds and matrix scores rescaled with a log to base 50000.
        log_scaled = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]

        if log_scaled:
            # NOTE(review): this rebinds the instance attribute, so a second call
            # re-applies the transformation to already transformed thresholds -
            # confirm that this is intended.
            self.__thresh = {
                k: (1 - math.log(v, 50000) if v != 0 else 0)
                for k, v in self.__thresh.items()
            }
        for a in self.__alleles:
            allele_prob[a.name] = a.prob
            try:
                pssm = __load_model(
                    self.__epi_pred.name,
                    "%s_%i" % (self.__epi_pred.convert_alleles([a])[0], en))
                if log_scaled:
                    for j, v in pssm.items():
                        for aa, score in v.items():
                            epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                                math.pow(10, score), 50000)
                else:
                    for j, v in pssm.items():
                        for aa, score in v.items():
                            epi_pssms[j, aa, a.name] = score
            except ImportError:
                # No matrix exists for this allele / epitope length combination.
                delete_alleles.append(a)

        # delete alleles from model that generated an error while loading matrices
        for a in delete_alleles:
            del allele_prob[a.name]
            del self.__thresh[a.name]

        if not epi_pssms:
            raise ValueError(
                "Selected alleles with epitope length are not supported by the prediction method."
            )

        # run spacer designs in parallel using multiprocessing
        # The pool is created only after the inputs were validated and is always
        # shut down again, so that neither the ValueError above nor a failing
        # worker leaves orphaned worker processes behind.
        pool = mp.Pool(threads)
        try:
            res = pool.map(
                _runs_lexmin,
                ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
                  allele_prob, self.__alpha, self.__thresh, self.__solver,
                  self.__beta, options) for i in range(start, self.__k + 1)
                 for ei, ej in itr.product(self.__peptides, repeat=2)
                 if ei != ej))
        finally:
            pool.close()
            pool.join()

        # find best scoring spacer for each epitope pair:
        # the edge weight is -min(c1, c2) and the smallest weight per pair wins
        opt_spacer = {}
        adj_matrix = {}
        inf = float("inf")
        for ei, ej, _score, _epi, spacer, c1, c2, _non_c in res:
            weight = -min(c1, c2)
            if adj_matrix.get((ei, ej), inf) > weight:
                adj_matrix[(ei, ej)] = weight
                opt_spacer[(ei, ej)] = spacer

        self.spacer = opt_spacer
        # solve assembly with generated adjacency matrix
        assembler = EpitopeAssembly(self.__peptides,
                                    self.__clev_pred,
                                    solver=self.__solver,
                                    matrix=adj_matrix)
        res = assembler.approximate()

        # generate output: epitope, spacer, epitope, spacer, ..., epitope
        sob = []
        for i in range(len(res) - 1):
            ei = str(res[i])
            ej = str(res[i + 1])
            if not i:
                sob.append(Peptide(ei))
            sob.append(Peptide(opt_spacer[ei, ej]))
            sob.append(Peptide(ej))
        return sob
import pandas as pd
from docopt import docopt

if __name__ == "__main__":
    # Script entry point: reads a table of binders, normalises its allele names
    # and runs the fred2wrap peptide effect prediction on the 9-mers.
    # The command line is parsed by docopt from the module docstring.
    arguments = docopt(__doc__)

    # Use the default binders table when no --input was given.
    file_in = arguments["--input"]
    if not file_in:
        file_in = "./data/binders.csv"
    file_out = arguments["--output"]

    # Keep only rows that have a sequence of exactly nine characters.
    dt = pd.read_csv(file_in)
    dt = dt[dt["Sequence"].notnull()]
    dt = dt[dt["Sequence"].str.len() == 9]

    peptides = [Peptide(peptide) for peptide in dt["Sequence"]]

    # Normalise the allele names step by step: drop "*", rewrite
    # "-<letters>NNNN" as "-<letters>*NN:NN", drop "w", strip the "HLA-" prefix.
    # NOTE(review): the patterns are meant as regular expressions; whether
    # Series.str.replace treats them as such by default depends on the pandas
    # version - confirm against the version in use.
    dt["allele"] = dt["allele"].str.replace("\*","").\
                   str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})","\\1*\\2:\\3").\
                   str.replace("w","").\
                   str.replace("HLA-","")
    dt.rename(columns={"Sequence": "peptide"}, inplace=True)

    # NOTE(review): `alleles` is built here, but the prediction call below is
    # given the raw allele strings instead - confirm which one is intended.
    alleles = [Allele(allele) for allele in dt["allele"].unique().tolist()]
    res = fred2wrap.predict_peptide_effects(
        peptides, alleles=dt["allele"].unique().tolist())
    # Convert the peptide and allele columns of the result to plain strings.
    res["peptide"] = [peptide.tostring() for peptide in res["peptide"]]
    res["allele"] = [str(allele) for allele in res["allele"]]

    # Pivot the result by method (this statement is cut off in this excerpt).
    res = res.pivot_table(index=["peptide", "allele"],
                          columns='method',
Beispiel #22
0
def main():
    """
    Command line entry point for the string-of-beads design with optimal spacers.

    Parses the command line, reads the epitopes and HLA alleles from the given
    files, designs the spacers and the epitope order with
    ``EpitopeAssemblyWithSpacer`` (approximately or optimally, depending on
    ``--tsp-solution``) and writes the resulting construct, joined with "-",
    to the output file and to stdout.

    With ``--random-order`` the epitopes are shuffled reproducibly (``--seed``)
    and joined with the spacers stored in ``solver.spacer``.

    Exits with status -1 if an unsupported cleavage or epitope prediction
    method was requested.
    """
    parser = argparse.ArgumentParser(
        description=
        """The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a",
        "--alleles",
        required=True,
        help=
        "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument(
        "-k",
        "--max_length",
        default=6,
        type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al",
        "--alpha",
        default=0.99,
        type=float,
        help=
        "Specifies the first-order preference of the user in the model [0,1] (default 0.99)"
    )
    parser.add_argument(
        "-be",
        "--beta",
        default=0.0,
        type=float,
        help=
        "Specifies the second-order preference of the user in the model [0,1] (default 0)."
    )

    parser.add_argument(
        "-cp",
        "--cleavage_prediction",
        default="PCM",
        help=
        "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument(
        "-ep",
        "--epitope_prediction",
        default="Syfpeithi",
        help=
        "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument(
        "-thr",
        "--threshold",
        default=20,
        type=float,
        help=
        "Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=None,
        help=
        "Specifies number of threads. If not specified all available logical cpus are used."
    )

    parser.add_argument(
        "--ips-solver",
        default="cplex",
        choices=["cplex", "cbc"],
        help=
        "Executable name of the IPS solver. Executable needs to be available in PATH."
    )

    parser.add_argument("--tsp-solution",
                        default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")

    parser.add_argument(
        "--random-order",
        action="store_true",
        help=
        "Indicate whether to generate a random ordered string-of-beads polypeptide"
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=1,
        help="Seed for random ordering of string-of-beads polypeptide")

    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    # The method names are compared case-insensitively.
    # print() is always called with one single string so that the output is
    # the same under Python 2 (print statement) and Python 3 (print function).
    if args.cleavage_prediction.upper() not in [
            "PCM", "PROTEASMM_C", "PROTEASMM_S"
    ]:
        print("Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S")
        sys.exit(-1)

    if args.epitope_prediction.upper() not in [
            "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"
    ]:
        print("Specified epitope predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC")
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    # the same threshold is used for every allele
    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver=args.ips_solver,
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    solver_options = {"preprocessing_presolve": "n", "threads": 1}

    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads, options=solver_options)
    else:
        svbws = solver.solve(threads=threads, options=solver_options)

    # Generate random ordered string-of-beads, but still uses optimal spacers
    # determined from the above solve function.
    if args.random_order:
        print("Generating a randomly ordered polypeptide")
        random.seed(args.seed)
        random.shuffle(peptides)
        shuffled = [str(peptide) for peptide in peptides]

        # Every epitope except the last one is followed by the spacer that was
        # designed for it and its right neighbour.
        random_order_sob = []
        for left_peptide, right_peptide in zip(shuffled, shuffled[1:]):
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]
            random_order_sob.extend(
                [Peptide(left_peptide),
                 Peptide(opt_spacer)])
        if shuffled:
            random_order_sob.append(Peptide(shuffled[-1]))

        svbws = random_order_sob

    sob_string = "-".join(map(str, svbws))
    print("")
    # Two blanks after the colon: this is what the former two-argument print emitted.
    print("Resulting String-of-Beads:  " + sob_string)
    print("")
    with open(args.output, "w") as f:
        f.write(sob_string)
Beispiel #23
0
if __name__ == "__main__":
    # Script entry point: reads a table of mutant / wild type peptide pairs and
    # prepares peptides and alleles from it.
    # The command line is parsed by docopt from the module docstring.
    arguments = docopt(__doc__)

    # Fall back to the default input and output tables when none are given.
    file_in = arguments["--input"]
    if not file_in:
        file_in = os.path.expanduser("data/immunogenic_SNVs-training_sets.csv")
    file_out = arguments["--output"]
    if not file_out:
        file_out = os.path.expanduser("data/immunogenic_SNVs-model_data.csv")

    # Keep only rows that have both a mutant and a wild type sequence.
    dt = pd.read_csv(file_in)
    dt = dt[dt["mutant_sequence"].notnull() & dt["wt_sequence"].notnull()]
    # dt = dt[dt["Sequence"].str.len() == 9]

    # Every distinct sequence (mutant or wild type) becomes one Peptide.
    # NOTE(review): Series.append is not available in recent pandas releases -
    # confirm against the pandas version in use.
    all_peptides = dt["mutant_sequence"].append(dt["wt_sequence"]).unique()
    peptides = [Peptide(peptide) for peptide in all_peptides]

    # Normalise the allele names step by step: drop "*" and ":", rewrite
    # "-<letters>NNNN" as "-<letters>*NN:NN", drop "w", strip the "HLA-" prefix.
    # NOTE(review): the patterns are meant as regular expressions; whether
    # Series.str.replace treats them as such by default depends on the pandas
    # version - confirm against the version in use.
    dt["allele"] = dt["allele"].str.replace("\*", "").\
        str.replace(":", "").\
        str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})", "\\1*\\2:\\3").\
        str.replace("w", "").\
        str.replace("HLA-", "")

    # TODO
    # dt.rename(columns = {"Sequence": "peptide"}, inplace = True)
    # Try to build an Allele for every row and record per row whether that worked
    # (the handling of failures is cut off in this excerpt).
    alleles = []
    valid_alleles = []
    for allele in dt["allele"].tolist():
        try:
            a = Allele(allele)
            valid_alleles.append(True)
Beispiel #24
0
# Positional argument: one or more HLA allele names, collected into a list of strings.
parser.add_argument('alleles',
                    metavar='allele',
                    nargs='+',
                    type=str,
                    help='HLA alleles to predict against.')
# Parse the command line (the parser itself is created before this excerpt).
args = parser.parse_args()

####################################################################################################

from Fred2.Core import Allele, Peptide
from Fred2.EpitopePrediction import EpitopePredictorFactory

####################################################################################################

# Wrap every raw input row (surrounding whitespace removed) as a Fred2.Core.Peptide
all_peptides = [Peptide(row.strip()) for row in args.peptides]

# Bucket the peptides by their sequence length
peptides_by_length = {}
for peptide in all_peptides:
    peptides_by_length.setdefault(len(peptide), []).append(peptide)

# Wrap every raw allele string as a Fred2.Core.Allele
alleles = [Allele(allele) for allele in args.alleles]

# Create the SYFPEITHI epitope predictor
predictor = EpitopePredictorFactory("Syfpeithi")

def read_epitope_input(args, alleles, exclude):
    """
    reads in epitope files generated by NGSAnalyzer+ImmunogenicityPredictor

    Header NGSAnalyzer:
        mutation - position of the mutation in the reference genome (currently hg19); format:
                   chromosome_position; the position is zero-based
        gene - gene affected by the mutation
        transcript - transcript affected by the mutation (UCSC known genes transcript ID)
        transcript_expression - expression in RPKM/FPKM of the affected transcript
        neopeptide - peptide resulting from the mutation in the given transcript
        length_of_neopeptide - length of the neopeptide
        HLA - HLA used for the binding prediction of the neopeptide
        HLA_class1_binding_prediction - predicted binding affinity (currently rank score of IEDB consensus tool)

    Header ImmunogenicityPredictor:
        immunogenicity - predicted immunogenicity for specific HLA allele in column
        distance - distance-to-self estimation specific for HLA allele in column
        [uncertainty] - If immunopredictor can estimate prediction uncertainty

    Rows with an empty neopeptide, with an excluded neopeptide or with an HLA
    that is not part of ``alleles`` are skipped (the latter with a warning).

    :param args: Input arguments; the attributes input, rank, immunogenicity, distance,
                 uncertainty and taa are read
    :param alleles: HLA alleles
    :param exclude: excluded peptides (None to exclude nothing)
    :return: df_epitope - EpitopePredictionResult
             distance - dict((pep_string, allele_name), float)
             expression - dict(gene_id, float), the maximum expression seen per gene
             uncertainty - dict((pep_string, allele_name), float)
             pep_to_mutation - dict(pep_string, list(mutation_string))
    """
    distance = {}
    uncertainty = {}
    expression = {}
    seq_to_pep = {}
    gene_to_prot = {}
    hla_to_allele = {a.name: a for a in alleles}
    df_pred = {a: {} for a in alleles}
    pep_to_mutation = {}

    # NOTE(review): the "U" mode flag is no longer accepted by recent Python 3
    # releases - confirm the targeted interpreter before changing the mode.
    with open(args.input, "rU") as f:
        reader = csv.DictReader(f, delimiter='\t')

        for row in reader:
            seq = row["neopeptide"]
            if not seq or (exclude is not None and seq in exclude):
                continue

            # one shared Peptide object per distinct sequence
            if seq in seq_to_pep:
                pep = seq_to_pep[seq]
            else:
                pep = Peptide(seq.upper())
                seq_to_pep[seq] = pep

            # Explicit lookup instead of a bare except, so that unrelated errors
            # (and KeyboardInterrupt) are not reported as an unknown allele.
            allele = hla_to_allele.get((row["HLA"] or "").replace("HLA-", ""))
            if allele is None:
                logging.warning(
                    "HLA {allele} was not contained in the provided allele file. Please check your input.".format(
                        allele=row["HLA"]))
                continue

            # one shared (sequence-less) Protein object per gene
            gene = row["gene"]
            if gene in gene_to_prot:
                prot = gene_to_prot[gene]
            else:
                prot = Protein("", gene_id=gene, transcript_id=gene)
                gene_to_prot[gene] = prot

            pep.proteins[prot.transcript_id] = prot
            pep.proteinPos[prot.transcript_id].append(0)
            pep_to_mutation.setdefault(seq, []).append(row["mutation"])
            expression.setdefault(gene, []).append(float(row["transcript_expression"]))

            raw_score = float(row[args.immunogenicity])
            if args.rank:
                # percentile rank -> score in [0, 1]
                score = max(0., 1. - raw_score / 100.0)
            elif args.immunogenicity == "HLA_class1_binding_prediction":
                # binding prediction -> 1 - log_50000(value), clipped at 0
                score = max(0., 1. - math.log(raw_score, 50000))
            else:
                score = raw_score
            df_pred[allele][pep] = score

            if args.distance is not None:
                distance[(seq, allele.name)] = float(row[args.distance])

            if args.uncertainty is not None:
                uncertainty[(seq, allele.name)] = float(row[args.uncertainty])

            if args.taa is not None:
                pep.log_metadata("taa", row[args.taa].upper() == "TAA")

    # a gene is represented by the highest expression of its rows
    expression = {k: max(v) for k, v in expression.items()}
    df_result = EpitopePredictionResult.from_dict(df_pred)
    # add the method level ("custom") to the index
    df_result.index = pandas.MultiIndex.from_tuples([(i, "custom") for i in df_result.index],
                                                    names=['Seq', 'Method'])

    return df_result, distance, expression, uncertainty, pep_to_mutation
 def setUp(self):
     """Create the fixtures: a peptide and an annotated protein in ``seqs`` plus an empty transcript."""
     peptide = Peptide("SYFPEISYFP")
     protein = Protein("IHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYS",
                       transcript_id="ID-01",
                       gene_id="FOXP3")
     self.seqs = [peptide, protein]
     self.transcript = Transcript("")
Beispiel #27
0
 """)

# Load the raw sequences of the requested dataset: _imm is wrapped into `imm`
# and _non into `non` below.
_imm, _non = set(), set()
if args.dataset == 'iedb.tcell':
    _imm, _non = pepdata.iedb.tcell.load_classes(nrows=args.n)
elif args.dataset == 'iedb.mhc':
    _imm, _non = pepdata.iedb.mhc.load_classes(nrows=args.n)
elif args.dataset == 'imma2':
    _imm, _non = pepdata.imma2.load_classes()
elif args.dataset == 'stdin':
    # One sequence per line; strip the line terminators and skip blank lines,
    # otherwise the newline would end up inside the peptide sequence.
    _imm = [line.strip() for line in stdin if line.strip()]
else:
    print("available datasets: iedb.tcell, iedb.mhc, imma2, stdin")
    exit()

# Wrap the raw sequences as Peptide objects
imm = [Peptide(elem) for elem in _imm]
non = [Peptide(elem) for elem in _non]

#hla='HLA-A2|HLA-A\*02'
#df = ()
#
#if args.dataset == 'iedb.tcell':
#	df =  pepdata.iedb.tcell.load_dataframe(nrows=args.n)
#elif args.dataset == 'iedb.mhc':
#	df =  pepdata.iedb.mhc.load_dataframe(nrows=args.n)
#elif args.dataset == 'iedb.bcell':
#	df =  pepdata.iedb.bcell.load_dataframe(nrows=args.n)
#elif args.dataset == 'stdin':
#	df = [ line for line in stdin ]
#else:
#	print("available datasets: iedb.tcell, ideb.mhc, imma2")