Ejemplo n.º 1
0
 def setUp(self):
     """Create the sequence and fragment fixtures used by the tests."""
     # One short peptide plus a protein made of a repeated "IHTIEPFYS" motif.
     peptide = Peptide("SYFPEISYFP")
     protein = Protein(
         "IHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYS", "ID-01", "FOXP3")
     self.seqs = [peptide, protein]

     fragment_sequences = ("FSYFPEITHIR", "FIHTIEPFYSR")
     self.fragments = [Peptide(s) for s in fragment_sequences]
Ejemplo n.º 2
0
 def test_smmtap_abitrary_peptide_length(self):
     """Run the "smmtap" TAP predictor on peptides of 9, 10 and 11 residues and print the result."""
     sequences = ("SYFPEITHI", "IHTIEPFYSA", "IHTIEPFYSAA")
     predictor = TAPPredictorFactory("smmtap")
     prediction = predictor.predict([Peptide(s) for s in sequences])
     print(prediction)
Ejemplo n.º 3
0
def generate_peptides_from_proteins(proteins, window_size, peptides=None):
    """
    Slide a window of ``window_size`` residues over each given
    :class:`~Fred2.Core.Protein.Protein` and collect the unique
    :class:`~Fred2.Core.Peptide.Peptide` objects found that way.

    Windows containing a character that is not in ``_allowed_aas`` are skipped.
    Every peptide records the source protein and the window start position
    under the protein's ``transcript_id``.

    :param proteins: (Iterable of) protein(s) from which a list of unique peptides should be generated
    :type proteins: list(:class:`~Fred2.Core.Protein.Protein`) or :class:`~Fred2.Core.Protein.Protein`
    :param int window_size: Size of peptide fragments
    :param peptides: Existing peptide(s) that are updated instead of being re-created (use case: adding and
                     updating peptides of newly generated proteins)
    :type peptides: list(:class:`~Fred2.Core.Peptide.Peptide`)
    :return: An iterator over the unique peptides
    :rtype: Generator(:class:`~Fred2.Core.Peptide.Peptide`)
    :raises ValueError: If ``peptides`` holds a non-Peptide object or ``proteins`` holds a non-Protein object
    """
    if isinstance(peptides, Peptide):
        peptides = [peptides]

    # sequence string -> Peptide instance; pre-seeded with the caller's peptides
    by_sequence = {}
    for known in peptides or ():
        if not isinstance(known, Peptide):
            raise ValueError("Specified list of Peptides contain non peptide objects")
        by_sequence[str(known)] = known

    if isinstance(proteins, Protein):
        proteins = [proteins]

    for protein in proteins:
        if not isinstance(protein, Protein):
            raise ValueError("Input does contain non protein objects.")

        residues = str(protein)
        for start in xrange(len(protein) + 1 - window_size):
            window = residues[start:start + window_size]
            # skip windows that contain anything but allowed amino acids
            if not all(aa in _allowed_aas for aa in window.upper()):
                continue

            transcript = protein.transcript_id
            if window not in by_sequence:
                by_sequence[window] = Peptide(window)
            peptide = by_sequence[window]
            peptide.proteins[transcript] = protein
            peptide.proteinPos[transcript].append(start)

    return by_sequence.itervalues()
Ejemplo n.º 4
0
def generate_peptides_from_proteins(proteins, window_size, peptides=None):
    """
    Build all unique :class:`~Fred2.Core.Peptide.Peptide` objects of length
    ``window_size`` contained in the given :class:`~Fred2.Core.Protein.Protein` object(s).

    Only windows made up entirely of characters from ``_allowed_aas`` are kept.
    For each kept window the peptide's ``proteins`` and ``proteinPos`` entries
    for the protein's ``transcript_id`` are updated.

    :param proteins: (Iterable of) protein(s) from which a list of unique peptides should be generated
    :type proteins: list(:class:`~Fred2.Core.Protein.Protein`) or :class:`~Fred2.Core.Protein.Protein`
    :param int window_size: Size of peptide fragments
    :param peptides: A list of peptides to update during peptide generation (use case: adding and updating
                     peptides of newly generated proteins)
    :type peptides: list(:class:`~Fred2.Core.Peptide.Peptide`)
    :return: An iterator over the unique peptides
    :rtype: Generator(:class:`~Fred2.Core.Peptide.Peptide`)
    :raises ValueError: If a non-Peptide is found in ``peptides`` or a non-Protein in ``proteins``
    """

    def iter_windows(protein):
        # Yield (window sequence, start position) for every window of the protein.
        text = str(protein)
        for offset in xrange(len(protein) + 1 - window_size):
            yield text[offset:offset + window_size], offset

    def is_clean(sequence):
        # True when every (upper-cased) character is an allowed amino acid.
        return all(letter in _allowed_aas for letter in sequence.upper())

    if isinstance(peptides, Peptide):
        peptides = [peptides]

    unique_peptides = {}

    if peptides:
        for existing in peptides:
            if not isinstance(existing, Peptide):
                raise ValueError("Specified list of Peptides contain non peptide objects")
            unique_peptides[str(existing)] = existing

    if isinstance(proteins, Protein):
        proteins = [proteins]

    for source in proteins:
        if not isinstance(source, Protein):
            raise ValueError("Input does contain non protein objects.")
        for fragment, position in iter_windows(source):
            if is_clean(fragment):
                tid = source.transcript_id
                if fragment not in unique_peptides:
                    unique_peptides[fragment] = Peptide(fragment)
                entry = unique_peptides[fragment]
                entry.proteins[tid] = source
                entry.proteinPos[tid].append(position)

    return unique_peptides.itervalues()
Ejemplo n.º 5
0
    def test_simple_assembly(self):
        """
        Smoke test: assemble ``self.peptides`` with the PCM cleavage predictor and the GLPK solver and compare
        the order with the expected one (solution manually tested for optimality).

        :return:
        """
        cleavage_predictor = CleavageSitePredictorFactory("PCM")
        solution = EpitopeAssembly(
            self.peptides, cleavage_predictor, solver="glpk", verbosity=0).solve()
        expected_order = [
            Peptide(seq) for seq in ("YLYDHLAPM", "ALYDVVSTL", "KLLPRLPGV")
        ]
        self.assertEqual(solution, expected_order)
Ejemplo n.º 6
0
def generate_peptides_from_protein(proteins, window_size, peptides=None):
    """
    Enumerate every peptide of length ``window_size`` contained in the given
    protein(s).

    Peptides are de-duplicated by sequence. Each peptide records the source
    protein and the window start position under the protein's
    ``transcript_id``.

    :param proteins: a single Protein or an iterable of Proteins from which the
                     unique peptides should be generated
    :param int window_size: size of peptide fragments
    :param list(Peptide) peptides: peptides to update during peptide generation
                                (use case: adding and updating peptides of newly generated proteins)
    :return: the ``values()`` of the internal sequence -> Peptide mapping
    :raises ValueError: if ``peptides`` contains a non-Peptide object or
                        ``proteins`` contains a non-Protein object
    """
    if isinstance(peptides, Peptide):
        peptides = [peptides]

    # First pass: validate the supplied peptides (only when there are any).
    if peptides:
        for candidate in peptides:
            if not isinstance(candidate, Peptide):
                raise ValueError("Specified list of Peptides contain non peptide objects")

    # Second pass: seed the sequence -> Peptide mapping.
    unique = {}
    if peptides is not None:
        for candidate in peptides:
            unique[str(candidate)] = candidate

    if isinstance(proteins, Protein):
        proteins = [proteins]

    for source in proteins:
        if not isinstance(source, Protein):
            raise ValueError("Input does contain non protein objects.")

        text = str(source)
        for offset in xrange(len(source) + 1 - window_size):
            fragment = text[offset:offset + window_size]

            tid = source.transcript_id
            if fragment not in unique:
                unique[fragment] = Peptide(fragment)

            entry = unique[fragment]
            entry.proteins[tid] = source
            entry.proteinPos[tid].append(offset)

    return unique.values()
Ejemplo n.º 7
0
    def setUp(self):
        self.proteins=[]
        self.alleles = [Allele("HLA-A*01:01"),Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
        self.peptides = [Peptide(p) for p in """SFSIFLLAL
GHRMAWDMM
VYEADDVIL
CFTPSPVVV
FLLLADARV
GPADGMVSK
YLYDHLAPM
GLRDLAVAV
GPTPLLYRL
TWVLVGGVL
IELGGKPAL
LAGGVLAAV
QYLAGLSTL
NFVSGIQYL
VLSDFKTWL
ARPDYNPPL
KLLPRLPGV
RHTPVNSWL
GLYLFNWAV
ALYDVVSTL
RRCRASGVL
WPLLLLLLA
VTYSLTGLW
YFVIFFVAA""".split()]
        self.result= EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
        self.thresh = {"A*01:01":10,"B*07:02":10,"C*03:01":10}
Ejemplo n.º 8
0
def extractEpitopesAndConservationFromConsensus(consensus_info, epitope_length):
    """
    Enumerate all epitopes of a fixed length in the consensus sequences and
    compute a conservation value for each of them.

    For every value ``ci`` of ``consensus_info`` the consensus sequence is
    taken from ``ci[0][0]`` and the per-position frequencies from
    ``ci[0][1]``. The conservation of a window is the product of the
    frequencies of its positions. When the same epitope occurs more than once,
    the largest conservation value is kept.

    :param dict consensus_info: mapping whose values carry
                                ``(consensus, frequencies)`` as first element
    :param int epitope_length: length of the epitopes to extract
    :return: ``(error, conservation)``; ``error`` is always the empty string
             and ``conservation`` maps each epitope (a ``Peptide``) to its
             conservation value
    :rtype: tuple(str, dict)
    """
    error = ''
    conservation = {}

    for ci in consensus_info.values():
        consensus = str(ci[0][0])
        frequencies = ci[0][1]

        for i in xrange(len(consensus) - epitope_length + 1):
            epitope = Peptide(consensus[i:i + epitope_length])
            # numpy.prod replaces the numpy.product alias removed in NumPy 2.0
            co = numpy.prod(frequencies[i:i + epitope_length])

            # keep the highest conservation seen for an epitope
            if epitope not in conservation or conservation[epitope] < co:
                conservation[epitope] = co
    return (error, conservation)
Ejemplo n.º 9
0
def read_peptide_input(filename):
    """
    Read peptides and their metadata from a tab-separated file.

    Expected columns (min required): id sequence. Every column other than
    ``sequence`` is attached to the peptide via ``log_metadata`` and its name
    is collected.

    :param str filename: path to the tab-separated input file
    :return: ``(peptides, metadata)`` -- the peptides in file order and the
             set of metadata column names that were seen
    :rtype: tuple(list, set)
    :raises KeyError: if a row has no ``sequence`` column
    """
    peptides = []
    # a set keeps each column name once instead of once per row
    metadata = set()
    with open(filename, 'r') as peptide_input:
        reader = csv.DictReader(peptide_input, delimiter='\t')
        for row in reader:
            pep = Peptide(row['sequence'])

            for col, value in row.items():
                if col != 'sequence':
                    pep.log_metadata(col, value)
                    metadata.add(col)
            peptides.append(pep)

    return peptides, metadata
def read_peptide_input(filename):
    """
    Read the peptides listed in a tab-separated file.

    Expected columns (min required): id sequence. Only the ``sequence``
    column is used.

    Note: this raises the csv module's field size limit to 600000, which
    applies to the whole process, not just to this call.

    :param str filename: path to the tab-separated input file
    :return: the peptides in file order
    :rtype: list
    :raises KeyError: if a row has no ``sequence`` column
    """
    with open(filename, 'r') as peptide_input:
        # enable listing of protein names for each peptide
        csv.field_size_limit(600000)
        reader = csv.DictReader(peptide_input, delimiter='\t')
        return [Peptide(row['sequence']) for row in reader]
Ejemplo n.º 11
0
    def setUp(self):
        """
        Create two test peptides plus two sets of 9-residue "self" peptide strings: one from the bundled
        example FASTA file and one from two hard-coded proteins.
        """
        self.peptides = [Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYS")]

        # Self peptides from the example FASTA file shipped with the package.
        fasta_path = pkg_resources.resource_filename(
            'Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
        with open(fasta_path, "rU") as fasta:
            fasta_records = list(SeqIO.parse(fasta, "fasta"))
        fasta_proteins = [Protein(str(rec.seq)) for rec in fasta_records]
        self.selfpeptides = [
            str(pep)
            for pep in generate_peptides_from_proteins(fasta_proteins, 9)
        ]

        # Smaller self-peptide set from two proteins only.
        first_protein = Protein(
            "MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE"
        )
        second_protein = Protein(
            "MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED"
        )
        few_peptides = generate_peptides_from_proteins(
            [first_protein, second_protein], 9)
        self.fewselfpeptides = [str(pep) for pep in few_peptides]
Ejemplo n.º 12
0
 def setUp(self):
     """Provide two 9-residue peptides as the shared test input."""
     self.peptides = [Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYS")]
Ejemplo n.º 13
0
    def approximate(self, start=0, threads=1, options=None):
        """
        Approximates the Epitope Assembly problem by applying Lin-Kernighan traveling salesman heuristic

        LKH implementation must be downloaded, compiled, and globally executable.

        Source code can be found here:
        http://www.akira.ruc.dk/~keld/research/LKH/

        For every ordered pair of distinct epitopes and every spacer length from ``start`` up to the configured
        maximum, a spacer is designed by ``_runs_lexmin`` in a worker pool. Per pair the spacer with the largest
        ``min(c1, c2)`` of the worker result is kept, and the resulting adjacency matrix is ordered with
        :class:`EpitopeAssembly`. The chosen spacers are also stored in ``self.spacer``.

        For the "smm", "smmpmbec" and "comblibsidney" predictors both the matrix scores and the thresholds are
        rescaled. The rescaled thresholds are computed once per call and only passed to the workers; the
        thresholds stored on the instance are left untouched, so repeated calls give the same result.

        :param int start: Start length for spacers (default 0).
        :param int threads: Number of threads used for spacer design (``None`` uses all CPUs). Be careful, if
                            options contain solver threads it will allocate threads*solver_threads cores!
        :param dict(str,str) options: Solver specific options (threads for example)
        :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide` (epitope, spacer, epitope, ...)
        :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
        :raises ValueError: If no epitope prediction matrix entries could be loaded for the selected alleles
        """
        def __load_model(name, model):
            return getattr(
                __import__("Fred2.Data.pssms." + name + ".mat." + model,
                           fromlist=[model]), model)

        options = dict() if options is None else options
        threads = mp.cpu_count() if threads is None else threads

        #prepare parameters
        cn = min(self.__clev_pred.supportedLength)
        cl_pssm = __load_model(self.__clev_pred.name,
                               self.__clev_pred.name + "_" + str(cn))
        cleav_pos = self.__clev_pred.cleavagePos
        en = self.__en
        epi_pssms = {}
        allele_prob = {}
        # these predictors need their scores (and thresholds) rescaled
        rescale = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]
        for a in self.__alleles:
            allele_prob[a.name] = a.prob
            pssm = __load_model(
                self.__epi_pred.name,
                "%s_%i" % (self.__epi_pred.convert_alleles([a])[0], en))
            for j, v in pssm.iteritems():
                for aa, score in v.iteritems():
                    if rescale:
                        epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                            math.pow(10, score), 50000)
                    else:
                        epi_pssms[j, aa, a.name] = score

        if not epi_pssms:
            raise ValueError(
                "Selected alleles with epitope length are not supported by the prediction method."
            )

        # Rescale the thresholds exactly once. Doing this inside the score loop
        # would re-apply the transformation to already transformed values.
        thresh = self.__thresh
        if rescale:
            thresh = {
                k: (1 - math.log(v, 50000) if v != 0 else 0)
                for k, v in self.__thresh.iteritems()
            }

        #run spacer designs in parallel using multiprocessing
        # The pool is created only after validation and is always shut down.
        pool = mp.Pool(threads)
        try:
            res = pool.map(
                _runs_lexmin,
                ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
                  allele_prob, self.__alpha, thresh, self.__solver,
                  self.__beta, options) for i in xrange(start, self.__k + 1)
                 for ei, ej in itr.product(self.__peptides, repeat=2)
                 if ei != ej))
        finally:
            pool.close()
            pool.join()

        opt_spacer = {}
        adj_matrix = {}
        inf = float("inf")
        #find best scoring spacer for each epitope pair
        for ei, ej, score, epi, spacer, c1, c2, non_c in res:
            if adj_matrix.get((ei, ej), inf) > -min(c1, c2):
                adj_matrix[(ei, ej)] = -min(c1, c2)
                opt_spacer[(ei, ej)] = spacer

        self.spacer = opt_spacer
        #solve assembly with generated adjacency matrix
        assembler = EpitopeAssembly(self.__peptides,
                                    self.__clev_pred,
                                    solver=self.__solver,
                                    matrix=adj_matrix)
        res = assembler.approximate()

        #generate output: epitopes interleaved with their spacers
        sob = []
        for i in xrange(len(res) - 1):
            ei = str(res[i])
            ej = str(res[i + 1])
            if not i:
                sob.append(Peptide(ei))
            sob.append(Peptide(opt_spacer[ei, ej]))
            sob.append(Peptide(ej))
        return sob
Ejemplo n.º 14
0
    def solve(self, start=0, threads=None, options=None):
        """
        Solve the epitope assembly problem with spacers optimally using integer linear programming.

        .. note::

            This can take quite long and should not be done for more than 30 epitopes max!
            Also, one has to disable pre-solving steps in order to use this model.

        For every ordered pair of distinct epitopes and every spacer length from ``start`` up to the configured
        maximum, a spacer is designed by ``_runs_lexmin`` in a worker pool. Per pair the spacer with the largest
        ``min(c1, c2)`` of the worker result is kept, and the resulting adjacency matrix is ordered with
        :class:`EpitopeAssembly`. The chosen spacers are also stored in ``self.spacer``.

        For the "smm", "smmpmbec" and "comblibsidney" predictors both the matrix scores and the thresholds are
        rescaled. The rescaled thresholds are computed once per call and only passed to the workers; the
        thresholds stored on the instance are left untouched, so repeated calls give the same result.

        :param int start: Start length for spacers (default 0).
        :param int threads: Number of threads used for spacer design (``None`` uses all CPUs).
                            Be careful, if options contain solver threads it will allocate threads*solver_threads cores!
        :param dict(str,str) options: Solver specific options as keys and parameters as values
        :return: A list of ordered :class:`~Fred2.Core.Peptide.Peptide` (epitope, spacer, epitope, ...)
        :rtype: list(:class:`~Fred2.Core.Peptide.Peptide`)
        """
        def __load_model(name, model):
            return getattr(
                __import__("Fred2.Data.pssms." + name + ".mat." + model,
                           fromlist=[model]), model)

        options = dict() if options is None else options
        threads = mp.cpu_count() if threads is None else threads

        #prepare parameters
        cn = min(self.__clev_pred.supportedLength)
        cl_pssm = __load_model(self.__clev_pred.name,
                               self.__clev_pred.name + "_" + str(cn))
        cleav_pos = self.__clev_pred.cleavagePos
        en = self.__en
        epi_pssms = {}
        allele_prob = {}
        # these predictors need their scores (and thresholds) rescaled
        rescale = self.__epi_pred.name in ["smm", "smmpmbec", "comblibsidney"]
        for a in self.__alleles:
            allele_prob[a.name] = a.prob
            pssm = __load_model(
                self.__epi_pred.name,
                "%s_%i" % (self.__epi_pred.convert_alleles([a])[0], en))
            for j, v in pssm.iteritems():
                for aa, score in v.iteritems():
                    if rescale:
                        epi_pssms[j, aa, a.name] = 1 / 10. - math.log(
                            math.pow(10, score), 50000)
                    else:
                        epi_pssms[j, aa, a.name] = score

        # Rescale the thresholds exactly once. Doing this inside the score loop
        # would re-apply the transformation to already transformed values.
        thresh = self.__thresh
        if rescale and epi_pssms:
            thresh = {
                k: (1 - math.log(v, 50000) if v != 0 else 0)
                for k, v in self.__thresh.iteritems()
            }

        #run spacer designs in parallel using multiprocessing
        # The pool is always shut down, even if a worker raises.
        pool = mp.Pool(threads)
        try:
            res = pool.map(
                _runs_lexmin,
                ((str(ei), str(ej), i, en, cn, cl_pssm, epi_pssms, cleav_pos,
                  allele_prob, self.__alpha, thresh, self.__solver,
                  self.__beta, options) for i in xrange(start, self.__k + 1)
                 for ei, ej in itr.product(self.__peptides, repeat=2)
                 if ei != ej))
        finally:
            pool.close()
            pool.join()

        opt_spacer = {}
        adj_matrix = {}
        inf = float("inf")
        #find best scoring spacer for each epitope pair
        for ei, ej, score, epi, spacer, c1, c2, non_c in res:
            if adj_matrix.get((ei, ej), inf) > -min(c1, c2):
                adj_matrix[(ei, ej)] = -min(c1, c2)
                opt_spacer[(ei, ej)] = spacer

        self.spacer = opt_spacer
        #solve assembly with generated adjacency matrix
        assembler = EpitopeAssembly(self.__peptides,
                                    self.__clev_pred,
                                    solver=self.__solver,
                                    matrix=adj_matrix)
        res = assembler.solve(options=options)

        #generate output: epitopes interleaved with their spacers
        sob = []
        for i in xrange(len(res) - 1):
            ei = str(res[i])
            ej = str(res[i + 1])
            if not i:
                sob.append(Peptide(ei))
            sob.append(Peptide(opt_spacer[ei, ej]))
            sob.append(Peptide(ej))
        return sob
Ejemplo n.º 15
0
    def __init__(self,
                 peptides,
                 pred,
                 solver="glpk",
                 weight=0.0,
                 matrix=None,
                 verbosity=0):
        """
        Build the integer linear program that orders ``peptides`` into one tour through all peptides and an
        artificial "Dummy" node.

        Without ``matrix`` the edge weights come from cleavage predictions on every concatenated peptide pair:
        the weight of an edge is the negated score at the junction, penalised by ``weight`` times the summed
        scores of the positions around it. Those two components are also stored in ``self.good_cleavage`` and
        ``self.neo_cleavage``. With ``matrix`` the given weights are used and no prediction is made.

        :param list peptides: peptides to assemble (must support slicing and ``len``)
        :param pred: cleavage site predictor, must be an ``ACleavageSitePrediction``
        :param str solver: solver name handed to ``SolverFactory``
        :param float weight: weight of the cleavage scores next to the junction
        :param dict matrix: optional precomputed edge weights keyed by ``(sequence, sequence)``; it is copied,
                            the caller's object is not modified
        :param int verbosity: values > 0 print the generated model
        :raises ValueError: if ``pred`` is not an ``ACleavageSitePrediction``
        """
        if not isinstance(pred, ACleavageSitePrediction):
            raise ValueError(
                "Cleave site predictor must be of type ACleavageSitePrediction"
            )

        if len(peptides) > 60:
            warnings.warn(
                "The peptide set exceeds 60. Above this level one has to expect "
                +
                "considerably long running times due to the complexity of the problem."
            )

        #Generate model
        #1. Generate peptides for which cleave sites have to be predicted
        #2. generate graph with dummy element
        self.__verbosity = verbosity

        pep_tmp = peptides[:]
        pep_tmp.append("Dummy")
        edge_matrix = {}
        fragments = {}
        seq_to_pep = {}
        self.neo_cleavage = {}
        self.good_cleavage = {}

        if matrix is None:
            for start, stop in itr.combinations(pep_tmp, 2):
                if start == "Dummy" or stop == "Dummy":
                    # edges from/to the dummy node are free
                    seq_to_pep[str(start)] = start
                    seq_to_pep[str(stop)] = stop
                    edge_matrix[(str(start), str(stop))] = 0
                    edge_matrix[(str(stop), str(start))] = 0
                else:
                    # both concatenation orders have to be predicted
                    start_str = str(start)
                    stop_str = str(stop)
                    frag = Peptide(start_str + stop_str)
                    garf = Peptide(stop_str + start_str)

                    fragments[frag] = (start_str, stop_str)
                    fragments[garf] = (stop_str, start_str)

            cleave_pred = pred.predict(fragments.keys())
            for i in set(cleave_pred.index.get_level_values(0)):
                # NOTE(review): ``.ix`` is not available in recent pandas
                # versions -- confirm the pandas version this runs against.
                fragment = "".join(cleave_pred.ix[i]["Seq"])
                start, stop = fragments[fragment]

                # index of the last residue of the first peptide
                cleav_pos = len(str(start)) - 1
                junction = cleave_pred.loc[(i, cleav_pos), pred.name]
                # scores of the positions around the junction (one before,
                # three after)
                neighbours = sum(
                    cleave_pred.loc[(i, j), pred.name]
                    for j in xrange(cleav_pos - 1, cleav_pos + 4, 1)
                    if j != cleav_pos)

                edge_matrix[(start, stop)] = -1.0 * (junction -
                                                     weight * neighbours)
                self.neo_cleavage[(start, stop)] = neighbours
                self.good_cleavage[(start, stop)] = junction
        else:
            # Copy, so the dummy edges are not written into the caller's matrix.
            edge_matrix = dict(matrix)
            seq_to_pep = {str(p): p for p in pep_tmp}
            for p in seq_to_pep:
                if p != "Dummy":
                    edge_matrix[(p, "Dummy")] = 0
                    edge_matrix[("Dummy", p)] = 0
        self.__seq_to_pep = seq_to_pep

        #3. initialize ILP
        self.__solver = SolverFactory(solver)
        model = ConcreteModel()

        E = [seq for seq in seq_to_pep.keys() if seq != "Dummy"]
        model.E = Set(initialize=E)
        model.E_prime = Set(initialize=seq_to_pep.keys())
        model.ExE = Set(initialize=itr.permutations(E, 2), dimen=2)

        model.w_ab = Param(model.E_prime,
                           model.E_prime,
                           initialize=edge_matrix)
        model.card = Param(initialize=len(model.E_prime))

        # x[a, b] == 1 when b directly follows a; u[a] is the position of a
        model.x = Var(model.E_prime, model.E_prime, within=Binary)
        model.u = Var(model.E, domain=PositiveIntegers, bounds=(2, model.card))

        model.obj = Objective(
            rule=lambda mode: sum(model.w_ab[a, b] * model.x[a, b]
                                  for a in model.E_prime for b in model.E_prime
                                  if a != b),
            sense=minimize)

        # every node has exactly one successor and one predecessor
        model.tour_constraint_1 = Constraint(
            model.E_prime,
            rule=lambda model, a: sum(model.x[a, b] for b in model.E_prime
                                      if a != b) == 1)
        model.tour_constraint_2 = Constraint(
            model.E_prime,
            rule=lambda model, a: sum(model.x[b, a] for b in model.E_prime
                                      if a != b) == 1)
        # Miller-Tucker-Zemlin style ordering constraint that rules out subtours
        model.cardinality_constraint = Constraint(
            model.ExE,
            rule=lambda model, a, b: model.u[a] - model.u[b] + 1 <=
            (model.card - 1) * (1 - model.x[a, b]))

        self.instance = model
        if self.__verbosity > 0:
            print("MODEL INSTANCE")
            self.instance.pprint()
Ejemplo n.º 16
0
 def setUp(self):
     """Set up two 9-residue peptides, one 11-residue fragment and the MHC class I / class II alleles."""
     self.peptides_mhcI = [Peptide(seq) for seq in ("SYFPEITHI", "IHTIEPFYS")]
     self.peptides_fragment = [Peptide("IHTIEPFYSAA")]
     self.mhcI = [Allele(name) for name in ("HLA-B*15:01", "HLA-A*02:01")]
     self.mhcII = [
         Allele(name) for name in ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
     ]
Ejemplo n.º 17
0
def generate_peptides_from_protein(proteins, window_size):
    """
    Creates all peptides for a given window size, from the given protein(s).

    Peptides are de-duplicated by sequence. For every peptide the source
    protein, the variants affecting it and the original transcript are stored
    under the protein's ``transcript_id``.

    :param proteins: a single Protein or a list of Proteins from which the
                     unique peptides should be generated
    :param int window_size: size of peptide fragments
    :return: the ``values()`` of the internal sequence -> Peptide mapping
    :raises ValueError: if ``proteins`` contains a non-Protein object
    """
    def frameshift_influences(tid, _vars, res, start):
        # Find variants outside the peptide frame that still influence it via
        # a frameshift, and append them to ``res``.
        accu = []  # accumulator for relevant variants
        shift = 0

        # sorted() leaves the caller's list untouched
        for var in sorted(_vars, key=lambda v: v.genomePos):

            pos = var.get_protein_position(tid)
            new_shift = var.get_shift()

            if pos < start:
                # does a variant yield a frame shift?
                if shift + new_shift:
                    shift += new_shift
                    accu.append(var)
                else:
                    # Frame restored: drop what was collected so far. This must
                    # stay a list; the former ``{}`` made the next append fail.
                    # NOTE(review): ``shift`` is not reset here -- confirm
                    # whether it should become 0 as well.
                    accu = []
            # here: var.get_protein_position >= start, we are done!
            else:
                res += accu
                break
        # NOTE(review): if no variant lies at or behind ``start``, the
        # collected variants are never added to ``res`` -- confirm intent.

    def gen_peptide_info(protein):
        # Generate peptide sequences and find the variants within each
        res = []

        seq = str(protein)
        for i in xrange(len(protein)+1-window_size):
            # generate peptide fragment
            end = i+window_size
            pep_seq = seq[i:end]

            # get the variants affecting the peptide:
            if protein.vars:
                # variants within the peptide:
                # NOTE(review): ``pos <= end`` also accepts the position just
                # behind the window -- confirm this is intended.
                pep_var = [var for pos, var_list in protein.vars.iteritems()
                           for var in var_list if i <= pos <= end]

                # outside variants that affect the peptide via frameshift:
                frameshift_influences(protein.transcript_id,
                                      protein.orig_transcript.vars.values(),
                                      pep_var, i)
            else:
                pep_var = []

            res.append((pep_seq, pep_var))
        return res

    final_peptides = {} # sequence : peptide-instance

    if isinstance(proteins, Protein):
        proteins = [proteins]

    if any(not isinstance(p, Protein) for p in proteins):
        raise ValueError("Input does contain non protein objects.")

    for prot in proteins:
        # generate all peptide sequences per protein:
        for (seq, _vars) in gen_peptide_info(prot):

            t_id = prot.transcript_id
            if seq not in final_peptides:
                final_peptides[seq] = Peptide(seq)

            final_peptides[seq].proteins[t_id] = prot
            final_peptides[seq].vars[t_id] = _vars
            final_peptides[seq].transcripts[t_id] = prot.orig_transcript

    return final_peptides.values()
def __main__():
    """
    Annotate the peptide hits of an idXML file with predicted MHC binding.

    Steps:

    1. Parse the command line (MHC class, input file, output file, allele
       file, optional allele directory).
    2. Read the target alleles, either from the given allele file or, with
       ``-allele in``, from a file in ``-dirallele`` whose name is derived
       from the "BD..." part of the input file name.
    3. Collect the unmodified sequences of all hits whose length fits the
       chosen MHC class (8-12 for class I, 15-25 for class II) and that
       contain none of the letters U, B, X, Z.
    4. Predict binding with netmhcpan (class I) or netmhcIIpan (class II) and
       keep, per peptide, the best scoring allele if ``categorize`` assigns
       the score a category.
    5. Store allele and score as meta values on the matching hits and write
       the result to the output file.

    Exits with status 1 on usage errors, unknown MHC class, a missing allele
    file name, an empty allele set or a failed prediction.
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-c',
                        dest="mhcclass",
                        help='<Required> MHC class',
                        required=True)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input file',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the output file",
                        required=True)
    parser.add_argument(
        '-allele',
        dest="allele",
        help=
        "<Required> full path to an allele file, if 'in', allele file will be deduced from in file name",
        required=True)
    parser.add_argument(
        '-dirallele',
        dest="dirallele",
        help=
        "for use with '-allele in', describes full base path to the allele files"
    )

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if not (options.inf or options.out or options.allele):
        parser.print_help()
        sys.exit(1)

    target_alleles_set = set()
    #Fred2.FileReader.read_lines is broken
    #alleles = FileReader.read_lines(options.allele, type=Allele)
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print("No class 1 type run detected.")
            sys.exit(0)
        # the allele file name is taken from the "BD..." part of the input name
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        if af is None:
            # previously this fell through to open(None)
            print("Could not deduce the allele file from the input file name.")
            sys.exit(1)
        allele_file = af
    else:
        allele_file = options.allele

    with open(allele_file, 'r') as handle:
        for line in handle:
            target_alleles_set.add(Allele(line.strip().upper()))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    if options.mhcclass == "I":
        ttn = EpitopePredictorFactory('netmhcpan', version='3.0')
        lowerBound = 8
        upperBound = 12
    elif options.mhcclass == "II":
        ttn = EpitopePredictorFactory('netmhcIIpan', version='3.1')
        lowerBound = 15
        upperBound = 25
    else:
        # previously the predictor and length bounds stayed undefined
        print("Unsupported MHC class '%s', expected 'I' or 'II'." %
              options.mhcclass)
        parser.print_help()
        sys.exit(1)

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            #if "decoy" not in h.getMetaValue("target_decoy"):
            unmod = h.getSequence().toUnmodifiedString()
            if lowerBound <= len(unmod) <= upperBound \
                    and not any(letter in unmod for letter in 'UBXZ'):
                pepstr.add(unmod)

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print("something went wrong with the netMHC prediction %s what: %s" %
              (options.inf, str(e)))
        sys.exit(1)

    #only max
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max()  #bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    for pep in peps:
        nhits = list()
        for h in pep.getHits():
            unmod = h.getSequence().toUnmodifiedString()
            if unmod in preds:
                allele, categ, score = preds[unmod]
                h.setMetaValue('binder', allele)
                h.setMetaValue(str(categ), score)
            # every hit is kept, annotated or not
            nhits.append(h)
        pep.setHits(nhits)

    f.store(options.out, pros, peps)
Ejemplo n.º 19
0
def run_sequential(input_epitopes, input_alleles, input_affinities,
                   output_vaccine, num_epitopes, min_alleles, min_proteins,
                   solver, **kwargs):
    """
    Select epitopes first and design the spacers afterwards, then write the
    resulting vaccine to a CSV file.

    The epitopes (without those containing 'X'), alleles with their
    thresholds, and affinities are loaded from the given inputs. ``OptiTope``
    selects ``num_epitopes`` epitopes, optionally with allele and antigen
    coverage constraints. ``EpitopeAssemblyWithSpacer`` then orders them and
    inserts spacers. The output file has one row with the columns
    ``immunogen``, ``vaccine``, ``spacers`` and ``cleavage``.

    :param input_epitopes: epitope source handed to ``utilities.load_epitopes``
    :param input_alleles: allele source handed to
                          ``utilities.get_alleles_and_thresholds``
    :param input_affinities: affinity source handed to ``affinities_from_csv``
    :param output_vaccine: path of the CSV file to write
    :param num_epitopes: number of epitopes to select
    :param min_alleles: allele coverage constraint, ``None`` to disable
    :param min_proteins: antigen coverage constraint, ``None`` to disable
    :param solver: solver name used for selection and assembly
    :param kwargs: accepted but not used
    """
    all_epitopes = utilities.load_epitopes(input_epitopes)
    epitope_data = {
        seq: record
        for seq, record in all_epitopes.items()
        if 'X' not in seq
    }
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    # we don't really need the actual protein sequence, just fill it with the id to make it unique
    peptide_coverage = {
        Peptide(record['epitope']):
        set(Protein(gid, gene_id=gid) for gid in record['proteins'])
        for record in epitope_data.values()
    }

    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = allele_table.to_dict('index')
    alleles = [
        Allele(name.replace('HLA-', ''), prob=info['frequency'] / 100)
        for name, info in allele_data.items()
    ]
    threshold = {
        name.replace('HLA-', ''): info['threshold']
        for name, info in allele_data.items()
    }
    LOGGER.info('Loaded %d alleles', len(threshold))

    affinities = affinities_from_csv(
        input_affinities, allele_data, peptide_coverage=peptide_coverage)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info('Selecting epitopes...')
    model = OptiTope(affinities, threshold, k=num_epitopes, solver=solver)
    if min_alleles is not None:
        model.activate_allele_coverage_const(min_alleles)
    if min_proteins is not None:
        model.activate_antigen_coverage_const(min_proteins)
    selected_epitopes = model.solve()

    LOGGER.info('Creating spacers...')
    assembler = EpitopeAssemblyWithSpacer(
        selected_epitopes, PCM(), BIMAS(), alleles,
        threshold=threshold, solver=solver)
    vaccine = assembler.solve()

    # even positions hold epitopes, odd positions hold spacers
    immunogen = sum(epitope_data[str(e)]['immunogen'] for e in vaccine[::2])
    sequence = ''.join(str(part) for part in vaccine)
    cleavage = pcm.DoennesKohlbacherPcm().cleavage_per_position(sequence)

    columns = ('immunogen', 'vaccine', 'spacers', 'cleavage')
    with open(output_vaccine, 'w') as f:
        writer = csv.DictWriter(f, columns)
        writer.writeheader()
        spacers = ';'.join(str(e) for e in vaccine[1::2])
        cleavage_scores = ';'.join('%.3f' % c for c in cleavage)
        writer.writerow({
            'immunogen': immunogen,
            'vaccine': sequence,
            'spacers': spacers,
            'cleavage': cleavage_scores,
        })
Ejemplo n.º 20
0
 def setUp(self):
     """Provide the three peptides used as input of the tests."""
     sequences = ("KLLPRLPGV", "YLYDHLAPM", "ALYDVVSTL")
     self.peptides = [Peptide(seq) for seq in sequences]
Ejemplo n.º 21
0
    def setUp(self):
        """Create two epitopes and a single HLA-A*02:01 allele (prob=0.5) as fixtures."""
        raw_epitopes = """GHRMAWDMM
                 VYEADDVIL"""
        # one sequence per line; surrounding whitespace is stripped
        stripped = (line.strip() for line in raw_epitopes.split("\n"))
        self.epis = map(Peptide, stripped)
        self.alleles = [Allele("HLA-A*02:01", prob=0.5)]
Ejemplo n.º 22
0
    def setUp(self):
        """Create two epitopes and a single HLA-A*02:01 allele (prob=0.5) as fixtures."""
        sequence_block = """GHRMAWDMM
                 VYEADDVIL"""
        # one sequence per line; surrounding whitespace is stripped
        epitopes = []
        for line in sequence_block.split("\n"):
            epitopes.append(Peptide(line.strip()))
        self.epis = epitopes
        self.alleles = [Allele("HLA-A*02:01", prob=0.5)]