Example #1
 def test_single_peptide_input_mhcII(self):
     for m in EpitopePredictorFactory.available_methods():
         model = EpitopePredictorFactory(m)
         if not isinstance(model, AExternalEpitopePrediction):
             if all(a.name in model.supportedAlleles for a in self.mhcII):
                 res = model.predict(self.peptides_mhcII[0],
                                     alleles=self.mhcII[1])
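A minimal end-to-end sketch of the flow these tests exercise, assuming Fred2 is installed; the import paths follow the Fred2 documentation, and the peptide and allele values are borrowed from other examples on this page:

    from Fred2.Core import Allele, Peptide
    from Fred2.EpitopePrediction import EpitopePredictorFactory

    peptides = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]
    alleles = [Allele("HLA-A*02:01")]

    # predict() returns a pandas DataFrame indexed by (peptide, method),
    # with one score column per allele
    predictor = EpitopePredictorFactory("Syfpeithi")
    results = predictor.predict(peptides, alleles=alleles)
    print(results)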
Example #2
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (
                     mo.version == "0.1" and mo.name == "netmhc"):
                 print "Testing", mo.name, "version", mo.version
                 try:
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcII):
                         mo.predict(self.peptides_mhcII,
                                    alleles=self.mhcII[0])
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcII_combined_alleles):
                         mo.predict(self.peptides_mhcII,
                                    alleles=self.mhcII_combined_alleles[0])
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcI):
                         mo.predict(self.peptides_mhcI,
                                    alleles=self.mhcI[0])
                     print "Success"
                 except RuntimeError as e:  #catch only those stemming from binary unavailability
                     if "could not be found in PATH" not in e.message:
                          raise e  # all others are re-raised
                     else:
                         print mo.name, "not available"
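The try/except above is needed because external predictors raise a RuntimeError when their binary is not on PATH. A shorter availability probe, sketched on the assumption (used by Example #16 below) that external predictors expose is_in_path() while purely matrix-based methods may not:

    from Fred2.EpitopePrediction import EpitopePredictorFactory

    for method in EpitopePredictorFactory.available_methods():
        predictor = EpitopePredictorFactory(method)
        try:
            available = predictor.is_in_path()  # external tools only
        except Exception:
            available = True  # assume no external binary is needed
        print(method, "ok" if available else "binary not in PATH")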
Example #3
    def test_multiple_peptide_input_mhcI(self):
        for m in EpitopePredictorFactory.available_methods():
            model = EpitopePredictorFactory(m)
            if all(a.name in model.supportedAlleles for a in self.mhcI):
                res = model.predict(self.peptides_mhcI, alleles=self.mhcI)
Example #4
 def test_path_option_and_optional_parameters(self):
     netmhc = EpitopePredictorFactory("NetMHC")
     exe = netmhc.command.split()[0]
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
             netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, path=exe_try, options="--sort")
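The manual PATH walk above predates shutil.which; on Python 3 the same executable lookup collapses to a single call. A sketch reusing the exe and netmhc names from the test above:

    import shutil

    exe_try = shutil.which(exe)  # exe = netmhc.command.split()[0]
    if exe_try:
        netmhc.predict(self.peptides_mhcI, alleles=self.mhcI,
                       path=exe_try, options="--sort")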
Example #5
 def test_path_and_optional_parameters_netctl(self):
     netctlpan = EpitopePredictorFactory("NetCTLpan")
     exe = netctlpan.command.split()[0]
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
             print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI,
                                     command=exe_try,
                                     options="-wt 0.05 -wc 0.225 -ethr 0.5")
Example #9
 def test_path_option_and_optional_parameters_netmhc(self):
     netmhc = EpitopePredictorFactory("NetMHC")
     exe = netmhc.command.split()[0]
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
             r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, command=exe_try, options="--sort", chunksize=1)
             self.assertTrue(len(r) == len(self.peptides_mhcI))
             self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"], 0.150579105869, places=7, msg=None, delta=None)
             self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"], 0.0619540879359, places=7, msg=None, delta=None)
Example #10
def run_predictor(pred, dataset):
    predictor = EpitopePredictorFactory(pred)
    results = ()
    try:
        results = predictor.predict(dataset,
                                    alleles=[Allele(a) for a in args.allele])
        print(results)
        print(results.describe())
    except ValueError:
        pass

    return (len(results), len(dataset))
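run_predictor relies on a module-level args object for the allele list. A hypothetical call site (the argparse setup and the peptide are illustrative, not part of the original script):

    import argparse
    from Fred2.Core import Peptide

    parser = argparse.ArgumentParser()
    parser.add_argument("--allele", action="append", default=["HLA-A*02:01"])
    args = parser.parse_args([])  # empty list: keep the default allele

    n_predicted, n_total = run_predictor("Syfpeithi", [Peptide("SYFPEITHI")])
    print("{}/{} peptides predicted".format(n_predicted, n_total))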
Example #11
    def test_pareto_assembly(self):
        cl_pred = CleavageSitePredictorFactory("PCM")
        ep_pred = EpitopePredictorFactory("SMM")
        allele = [Allele("HLA-A*02:01")]
        thresh = {a.name:10000 for a in allele}
        comp = lambda a,b: a <= b

        print ep_pred.predict(self.peptides,alleles=allele)
        #cl_pred, ep_pred, alleles, threshold, comparator, length=9

        assembler = ParetoEpitopeAssembly(self.peptides,cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=1)
        r = assembler.solve(eps=1e10, order=(1,0))
        print r
Example #12
    def test_pareto_assembly(self):
        cl_pred = CleavageSitePredictorFactory("PCM")
        ep_pred = EpitopePredictorFactory("SMM")
        allele = [Allele("HLA-A*02:01")]
        thresh = {a.name:10000 for a in allele}
        comp = lambda a,b: a <= b

        print(ep_pred.predict(self.peptides,alleles=allele))
        #cl_pred, ep_pred, alleles, threshold, comparator, length=9

        assembler = ParetoEpitopeAssembly(self.peptides,cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=1)
        r = assembler.solve(eps=1e10, order=(1,0))
        print(r)
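In both variants the threshold dict and the comparator jointly define binding: presumably a peptide counts as a binder for allele a when comp(score, thresh[a.name]) holds, matching the (cl_pred, ep_pred, alleles, threshold, comparator) signature noted in the comment. The same setup with a named function instead of a lambda:

    def comp(score, threshold):
        # treat scores at or below the threshold as binding
        return score <= threshold

    thresh = {a.name: 10000 for a in allele}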
Example #13
    def setUp(self):
        self.proteins=[]
        self.alleles = [Allele("HLA-A*01:01"),Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
        self.peptides = [Peptide(p) for p in """SFSIFLLAL
GHRMAWDMM
VYEADDVIL
CFTPSPVVV
FLLLADARV
GPADGMVSK
YLYDHLAPM
GLRDLAVAV
GPTPLLYRL
TWVLVGGVL
IELGGKPAL
LAGGVLAAV
QYLAGLSTL
NFVSGIQYL
VLSDFKTWL
ARPDYNPPL
KLLPRLPGV
RHTPVNSWL
GLYLFNWAV
ALYDVVSTL
RRCRASGVL
WPLLLLLLA
VTYSLTGLW
YFVIFFVAA""".split()]
        self.result= EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
        self.thresh = {"A*01:01":10,"B*07:02":10,"C*03:01":10}
Example #14
    def setUp(self):
        self.proteins=[]
        self.alleles = [Allele("HLA-A*01:01"),Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
        self.peptides = [Peptide(p) for p in """SFSIFLLAL
GHRMAWDMM
VYEADDVIL
CFTPSPVVV
FLLLADARV
GPADGMVSK
YLYDHLAPM
GLRDLAVAV
GPTPLLYRL
TWVLVGGVL
IELGGKPAL
LAGGVLAAV
QYLAGLSTL
NFVSGIQYL
VLSDFKTWL
ARPDYNPPL
KLLPRLPGV
RHTPVNSWL
GLYLFNWAV
ALYDVVSTL
RRCRASGVL
WPLLLLLLA
VTYSLTGLW
YFVIFFVAA""".split()]
        self.result= EpitopePredictorFactory("NetMHC").predict(self.peptides, self.alleles)
        self.thresh = {"A*01:01":0,"B*07:02":0,"C*03:01":0}
Example #15
def valid_predictors(supported_length=9,
                     exclude_predictors=["epidemix", "unitope", "netctlpan"]):
    """
    Get the information for all predictors and keep only
    the relevant ones.

    Args:
       supported_length (int): Supported peptide input length.
       exclude_predictors (list of str): Method names to exclude in addition.
    """

    methods = EpitopePredictorFactory.available_methods().keys()
    dt = pd.DataFrame([predictor_info(method) for method in methods])
    n_init = len(dt)

    dt = dt[[supported_length in elems for elems in dt["supportedLength"]]]
    dt = dt[dt["type"].notnull()]  # we should know where it was trained
    dt = dt[dt["is_in_path"].isnull() | dt["is_in_path"]]

    for excl_predictor in exclude_predictors:
        dt = dt[dt["name"] != excl_predictor]

    print("removed {0} methods from Fred2. {1} remain".\
          format(n_init - len(dt), len(dt)))

    return dt
Example #16
def predictor_info(method):
    """
    Get all the information about a particular predictor/method from Fred2
    """

    predictor = EpitopePredictorFactory(method)
    try:
        is_in_path = predictor.is_in_path()
    except Exception:
        is_in_path = None
    try:
        command = predictor.command
    except Exception:
        command = None

    method_hash = {
        "syfpeithi": "T-cell epitope",
        "bimas": "MHC-I binding",
        "svmhc": "MHC-I binding",
        "arb": "MHC-I binding",
        "smm": "MHC-I binding",
        "smmpmbec": "MHC-I binding",
        "epidemix": "MHC-I binding",
        "comblib": "MHC-I binding",
        "comblibsidney": "MHC-I binding",
        "pickpocket": "MHC-I binding",
        "netmhc": "MHC-I binding",
        "netmhcpan": "MHC-I binding",
        "hammer": "MHC-II binding",
        "tepitopepan": "MHC-II binding",
        "netmhcii": "MHC-II binding",
        "netmhciipan": "MHC-II binding",
        "unitope": "T-cell epitope",
        "netctlpan": "T-cell epitope",
    }

    retdict = {
        "is_in_path": is_in_path,
        "name": method,
        "supportedAlleles": predictor.supportedAlleles,
        "supportedLength": predictor.supportedLength,
        "command": command,
        "version": predictor.version,
        "type": method_hash.get(method)
    }
    return retdict
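A hypothetical call chain for the two helpers above, using only the fields their bodies populate:

    info = predictor_info("syfpeithi")
    print(info["type"], sorted(info["supportedLength"]))

    dt = valid_predictors(supported_length=9)
    print(dt[["name", "version", "type"]].to_string(index=False))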
Example #17
def predict_peptide_effects(peptides, alleles=None):
    """
    Predict the peptide effect for all the available methods on the machine

    Args:
        peptides (list of Peptides): Usually an output from read_fasta
        alleles (list of str): Alleles for which to run the predictors

    Returns:
        pd.DataFrame: Tidy pd.DataFrame. If the method is unable to predict
                      for a particular value the rows are not present.

    Example:
    >>> peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
    >>> alleles = ['A*02:16', 'B*45:01']
    >>> predict_peptide_effects(peptides, alleles = alleles).head()
                               Seq    Method   allele       score
    0  (F, I, A, S, N, G, V, K, L)       arb  A*02:16  594.691144
    1  (F, I, A, S, N, G, V, K, L)       smm  A*02:16  159.768074
    2  (F, I, A, S, N, G, V, K, L)  smmpmbec  A*02:16  211.977614
    4  (F, I, A, S, N, G, V, K, L)   unitope  A*02:16    0.527849
    5  (L, L, G, A, T, C, M, F, V)       arb  A*02:16    6.784222
    """
    dt = valid_predictors()
    results = []
    for i in range(len(dt)):
        # subset to valid alleles
        if alleles is not None:
            valid_alleles = dt.iloc[i]["supportedAlleles"].intersection(
                alleles)

            if len(valid_alleles) == 0:
                continue
            valid_alleles = [Allele(al) for al in valid_alleles]
        else:
            valid_alleles = None
        method = dt.iloc[i]["name"]
        print("method: ", method)
        # TODO - use try, except
        t0 = time.time()

        try:
            results.append(
                EpitopePredictorFactory(method).predict(peptides,
                                                        alleles=valid_alleles))
        except Exception:
            print("Error! Unable to run ", method, ": ", sys.exc_info())
        t1 = time.time()
        print("  - runtime: ", str(t1 - t0))

    df = results[0].merge_results(results[1:]).reset_index()
    dfm = pd.melt(df,
                  id_vars=["Seq", "Method"],
                  var_name="allele",
                  value_name="score")
    dfm = dfm[dfm["score"].notnull()]
    dfm.rename(columns={'Seq': 'peptide', 'Method': 'method'}, inplace=True)
    return dfm
Example #18
 def test_epitope_conservation_constraint(self):
     import random
     self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
     conservation = {}
     for e in self.result.index.levels[0]:
         conservation[str(e)] = random.random()
     pt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0)
     pt.activate_epitope_conservation_const(0.5, conservation=conservation)
     for e in pt.solve():
         print e, conservation[e]
Example #19
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (mo.version=="0.1" and mo.name=="netmhc"):
                 if any(a.name in mo.supportedAlleles for a in self.mhcII):
                     mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                 else:
                     mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
Example #20
 def test_epitope_conservation_constraint(self):
      import random
      self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
      conservation = {}
      print self.result.index.levels[0]
      for e in self.result.index.levels[0]:
          conservation[str(e)] = random.random()
      pt = OptiTope(self.result, self.thresh, k=3, solver="cplex" ,verbosity=1)
      pt.activate_epitope_conservation_const(0.5, conservation=conservation)
      for e in pt.solve():
          print e, conservation[e]
Example #21
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for available prediction tool versions."
    )
    parser.add_argument('-v',
                        '--versions',
                        help='File with used software versions.',
                        required=True)
    args = parser.parse_args()

    # NOTE this needs to be updated manually, if other methods should be used in the future
    available_methods = [
        'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'
    ]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1]))
                        for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each method the corresponding tool version
        methods = {
            method.strip(): version.strip()
            for tool, version in tool_version for method in available_methods
            if tool.lower() in method.lower()
        }

    for method, version in methods.items():
        if (version
                not in EpitopePredictorFactory.available_methods()[method]):
            raise ValueError("The specified version " + version + " for " +
                             method + " is not supported by Fred2.")

        predictor = EpitopePredictorFactory(method, version=version)
        with open(method + ".v" + str(version) + ".supported_alleles.txt",
                  'w') as output:
            for a in sorted(predictor.supportedAlleles):
                output.write(convert_allele_back(a) + "\n")
        with open(method + ".v" + str(version) + ".supported_lengths.txt",
                  'w') as output:
            for l in sorted(predictor.supportedLength):
                output.write(str(l) + "\n")
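Since the versions file is parsed with csv.reader(..., delimiter=":"), each row is expected to read "tool: version". A purely hypothetical example of such a file (version numbers are illustrative):

    mhcflurry: 1.4.3
    mhcnuggets-class-1: 2.3.2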
Example #22
    def test_pareto_front_assembly(self):
        cl_pred = CleavageSitePredictorFactory("PCM")
        ep_pred = EpitopePredictorFactory("SMM")
        allele = [Allele("HLA-A*02:01")]
        thresh = {a.name:10000 for a in allele}
        comp = lambda a,b: a <= b

        assembler = ParetoEpitopeAssembly(self.peptides,cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=0)
        r = assembler.paretosolve()
        print(r)

        #print assembler.solve(eps=2.0)
Example #23
    def test_standart_functions(self):
        """
        Tests default functions
        needs GLPK installed
        :return:
        """
        epi_pred =  EpitopePredictorFactory("Syfpeithi")
        cl_pred = CleavageSitePredictorFactory("PCM")

        sbws = EpitopeAssemblyWithSpacer(self.epis,cl_pred,epi_pred,self.alleles)
        sol = sbws.solve()
        print sol
        assert all(i == str(j) for i,j in zip(["GHRMAWDMM","HH","VYEADDVIL"],sol))
Example #24
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (mo.version=="0.1" and mo.name=="netmhc"):
                 print "Testing", mo.name, "version", mo.version
                 try:
                     if any(a.name in mo.supportedAlleles for a in self.mhcII):
                         mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                     if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles):
                         mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0])
                     if any(a.name in mo.supportedAlleles for a in self.mhcI):
                         mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
                     print "Success"
                 except RuntimeError as e: #catch only those stemming from binary unavailability
                     if "could not be found in PATH" not in e.message:
                          raise e  # all others are re-raised
                     else:
                         print mo.name, "not available"
Example #25
    def test_allele_cov_constraint(self):
        """
        Tests the allele coverage constraints

        :return:
        """
        #self.alleles.extend([Allele("HLA-A*02:01"),Allele("HLA-B*15:01")])
        #self.thresh.update({"A*02:01":0,"B*15:01":0})
        self.result= EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
        opt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0)
        opt.activate_allele_coverage_const(0.99)
        r = opt.solve()

        self.assertTrue(len(set(str(p) for p in r) - set(["GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"])) == 0 )
Example #26
 def test_unsupported_allele_length_combination_exception(self):
     """
     Tests default functions
     needs GLPK installed
     :return:
     """
     epi_pred = EpitopePredictorFactory("Syfpeithi")
     cl_pred = CleavageSitePredictorFactory("PCM")
     alleles = [Allele("HLA-A*26:01", prob=0.5)]
     sbws = EpitopeAssemblyWithSpacer(self.epis,
                                      cl_pred,
                                      epi_pred,
                                      alleles,
                                      solver="cbc")
     self.assertRaises(ValueError, sbws.solve)
Example #27
 def test_unsupported_allele_length_combination(self):
     """
     Tests default functions
     needs GLPK installed
     :return:
     """
     epi_pred = EpitopePredictorFactory("Syfpeithi")
     cl_pred = CleavageSitePredictorFactory("PCM")
     alleles = [
         Allele("HLA-A*02:01", prob=0.5),
         Allele("HLA-A*26:01", prob=0.5)
     ]
     sbws = EpitopeAssemblyWithSpacer(self.epis,
                                      cl_pred,
                                      epi_pred,
                                      alleles,
                                      solver="cbc")
     sol = sbws.solve()
     print sol
     assert all(i == str(j)
                for i, j in zip(["GHRMAWDMM", "HH", "VYEADDVIL"], sol))
Example #28
    def test_allele_cov_constraint(self):
         """
         Tests the allele coverage constraints

         :return:
         """
         #self.alleles.extend([Allele("HLA-A*02:01"),Allele("HLA-B*15:01")])
         #self.thresh.update({"A*02:01":0,"B*15:01":0})
         self.result= EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
         print self.result[self.alleles[0:2]]
         opt = OptiTope(self.result,self.thresh,k=3,solver="cplex",verbosity=1)
         opt.activate_allele_coverage_const(0.8)
         r = opt.solve()
         res_df = self.result.xs(self.result.index.values[0][1], level="Method")
         peps = [str(p) for p in r]

         probs = {"A*01:01":1, "A*02:01":1, "B*07:02":1,"B*15:01":1,"C*03:01":1.0}
         res_df = res_df.loc[peps, :]
         res_df = res_df[[a for a in self.alleles]]
         res_df = res_df[res_df.apply(lambda x: any(x[a] > self.thresh[a.name] for a in self.alleles), axis=1)]

         print res_df.apply(lambda x: sum( x[c]*probs[c.name] for c in res_df.columns),axis=1)

         self.assertTrue(len(set(str(p) for p in r) - set(["ALYDVVSTL", "KLLPRLPGV", "GPTPLLYRL"])) == 0 )
Example #29
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Example #30
def main():

    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInspector.')

    model.add_argument(
        '-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method')

    model.add_argument('-v',
                       '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')

    model.add_argument(
        '-t',
        '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help=
        'Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument('-p',
                       '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')

    model.add_argument('-l',
                       '--length',
                       choices=range(8, 18),
                       type=int,
                       default=9,
                       help='The length of peptides')

    model.add_argument(
        '-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)')

    model.add_argument(
        '-r',
        '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.')

    model.add_argument(
        '-fINDEL',
        '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')

    model.add_argument('-fFS',
                       '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')

    model.add_argument('-fSNP',
                       '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')

    model.add_argument('-o',
                       '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')
    model.add_argument('-etk',
                       '--etk',
                       action="store_true",
                       help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs ar given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf,
                                                     gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)

        if args.filterINDEL:
            variants = filter(
                lambda x: x.type not in [
                    VariationType.INS, VariationType.DEL, VariationType.FSDEL,
                    VariationType.FSINS
                ], variants)

        if args.filterFSINDEL:
            variants = filter(
                lambda x: x.type not in
                [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        epitopes = filter(
            lambda x: any(
                x.get_variants_by_protein(tid)
                for tid in x.proteins.iterkeys()),
            generate_peptides_from_variants(variants, int(args.length), martDB,
                                            EIdentifierTypes.ENSEMBL))

        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID != None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    #else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(
                    ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[
                        EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(
                        Protein(
                            protein_seq,
                            gene_id=l.strip(),
                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    #read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                          alleles=alleles)

    with open(args.output, "w") as f:
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) +
                "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(
                set([
                    transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                    for prot in p.get_all_proteins()
                ]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t" + "|".join(
                    set(
                        prot_id.split(":FRED2")[0] + ":" + ",".join(
                            repr(v)
                            for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins.iterkeys()
                        if p.get_variants_by_protein(prot_id)))

            f.write(
                str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a]
                                                          for a in alleles) +
                "\t" + proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(
                    set([
                        transcript_to_genes[prot.transcript_id.split(
                            ":FRED2")[0]] for prot in p.get_all_proteins()
                    ]))
                g.write(
                    str(p) + "\t" + "\t".join("%.3f" % row[a]
                                              for a in alleles) + "\t" +
                    proteins + "\n")
    return 0
Example #31
def make_predictions_from_peptides(peptides, methods, alleles, protein_db,
                                   identifier, metadata):
    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # filter out self peptides if specified
    selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
    peptides_filtered = [p for p in peptides if str(p) not in selfies]

    # sort peptides by length (for predictions)
    sorted_peptides = {}

    for p in peptides_filtered:
        length = len(str(p))
        if length in sorted_peptides:
            sorted_peptides[length].append(p)
        else:
            sorted_peptides[length] = [p]

    for peplen in sorted_peptides:
        all_peptides_filtered = sorted_peptides[peplen]
        results = []
        for m in methods:
            try:
                results.extend([
                    EpitopePredictorFactory(m.split('-')[0],
                                            version=m.split('-')[1]).predict(
                                                all_peptides_filtered,
                                                alleles=alleles)
                ])
            except Exception:
                logging.warning(
                    "Prediction for length {length} and allele {allele} not possible with {method}. No model available."
                    .format(length=peplen,
                            allele=','.join([str(a) for a in alleles]),
                            method=m))

        # merge dataframes of the performed predictions
        if (len(results) == 0):
            continue
        df = results[0].merge_results(results[1:])

        df.insert(0, 'length', df.index.map(create_length_column_value))

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' %
                              (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' %
                                (conv_allele, peplen)] = get_matrix_max_score(
                                    conv_allele, peplen)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        mandatory_columns = [
            'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type',
            'synonymous', 'homozygous', 'variant details (genomic)',
            'variant details (protein)'
        ]

        for header in mandatory_columns:
            if header not in metadata:
                df[header] = np.nan
            else:
                df[header] = df.apply(
                    lambda row: row[0].get_metadata(header)[0], axis=1)

        for c in list(set(metadata) - set(mandatory_columns)):
            df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(
                    idx + 1, '%s affinity' % c,
                    df.apply(lambda x: create_affinity_values(
                        str(c), int(x['length']), float(x[c]), x['Method'],
                        max_values_matrices, allele_string_map),
                             axis=1))
                df.insert(
                    idx + 2, '%s binder' % c,
                    df.apply(lambda x: create_binder_values(
                        float(x['%s affinity' % c]), x['Method']),
                             axis=1))
                df = df.rename(columns={c: '%s score' % c})

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # write prediction statistics
    statistics = {
        'prediction_methods': methods,
        'number_of_variants': '-',
        'number_of_peptides': len(peptides),
        'number_of_peptides_after_filtering': len(peptides_filtered)
    }

    return pred_dataframes, statistics
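Each entry of methods is expected in "name-version" form (the function splits on "-" to recover the Fred2 method and its version). A hypothetical invocation, assuming peptides, alleles, an up_db protein database, and metadata have been prepared as in Example #33 below:

    pred_dfs, stats = make_predictions_from_peptides(
        peptides, ["syfpeithi-1.0"], alleles, up_db, "sample01", metadata)
    print(stats)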
Example #32
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default=1,
                        help="MHC class I or II")
    parser.add_argument('-l',
                        "--max_length",
                        help="Maximum peptide length",
                        type=int)
    parser.add_argument('-ml',
                        "--min_length",
                        help="Minimum peptide length",
                        type=int)
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True,
                        type=str)
    parser.add_argument('-t',
                        '--tools',
                        help='Tools requested for peptide predictions',
                        required=True,
                        type=str)
    parser.add_argument('-v',
                        '--versions',
                        help='<Required> File with used software versions.',
                        required=True)
    args = parser.parse_args()
    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1]))
                        for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {
            method.strip(): version.strip()
            for tool, version in tool_version for method in selected_methods
            if tool.lower() in method.lower()
        }

    # get the alleles
    alleles = [Allele(a) for a in args.alleles.split(";")]

    peptide_lengths = []
    if (args.peptides):
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[
                    method.lower()]:
                raise ValueError("The specified version " + version + " for " +
                                 method + " is not supported by Fred2.")

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write(
                "All selected alleles are supported by at least one of the requested tools.\n"
            )
        if no_allele_support:
            output.write(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )

        output.write("\n")
        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write(
                "All selected or provided peptide lengths are supported by at least one of the requested tools.\n"
            )
        if no_length_support:
            output.write(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )
Example #33
def __main__():
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"): 
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [ (row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter = "\t") ]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = { method:version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower() }

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I or II predictions
    if int(args.mhcclass) == 1:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
    else:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # complete_df.replace({'method': methods}, inplace=True)
    complete_df['method'] = complete_df['method'].apply(lambda x : x + '-' + methods[x] )

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))
    
    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
        
    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        gene_id_in_df = complete_df.iloc[1]['gene']  # inspect one row to detect the gene ID system
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        if args.wild_type is not None:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = len(set(binders))
    statistics['number_of_unique_nonbinders'] = len(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
    
    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
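A minimal sketch (not part of the pipeline above) of how the per-row binder bookkeeping could be expressed with vectorized pandas operations; the column names mirror the dataframe built above:

import pandas as pd

def summarize_binders(complete_df, binder_cols):
    # a peptide counts as a binder if any prediction method flags it
    is_binder = complete_df[binder_cols].any(axis=1)
    seqs = complete_df['sequence'].astype(str)
    binders = seqs[is_binder].tolist()
    # unique non-binders: sequences never flagged by any method
    non_binders = sorted(set(seqs) - set(binders))
    return binders, non_binders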
Ejemplo n.º 34
0
def make_predictions_from_variants(variants_all, methods, alleles, minlength,
                                   maxlength, martsadapter, protein_db,
                                   identifier, metadata, transcriptProteinMap):
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = [
        p for p in generator.generate_proteins_from_transcripts(
            generator.generate_transcripts_from_variants(
                variants_all, martsadapter, ID_SYSTEM_USED))
    ]

    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [
            x for x in peptides_var if any(
                x.get_variants_by_protein(y) for y in x.proteins.keys())
        ]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for m in methods:
                try:
                    results.extend([
                        EpitopePredictorFactory(
                            m.split('-')[0],
                            version=m.split('-')[1]).predict(filtered_peptides,
                                                             alleles=alleles)
                    ])
                except Exception:
                    logging.warning(
                        "Prediction for length {length} and allele {allele} not possible with {method}."
                        .format(length=peplen,
                                allele=','.join([str(a) for a in alleles]),
                                method=m))

        if not results:
            continue

        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' %
                              (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' %
                                (conv_allele, peplen)] = get_matrix_max_score(
                                    conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(
            create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(
            create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(
                    idx + 1, '%s affinity' % c,
                    df.apply(lambda x: create_affinity_values(
                        str(c), int(x['length']), float(x[c]), x['Method'],
                        max_values_matrices, allele_string_map),
                             axis=1))
                df.insert(
                    idx + 2, '%s binder' % c,
                    df.apply(lambda x: create_binder_values(
                        float(x['%s affinity' % c]), x['Method']),
                             axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df['%s score' %
                                        c].map(lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c),
                             axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {
        'prediction_methods': methods,
        'number_of_variants': len(variants_all),
        'number_of_peptides': len(all_peptides),
        'number_of_peptides_after_filtering': len(all_peptides_filtered)
    }

    return pred_dataframes, statistics, all_peptides_filtered, prots
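A note on the naming convention used above: each entry in methods is expected to be a "<name>-<version>" string that is split into the factory name and version before instantiation. A minimal sketch (the method string is an illustrative assumption):

method_string = "syfpeithi-1.0"
name, version = method_string.split('-', 1)
predictor = EpitopePredictorFactory(name, version=version)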
Ejemplo n.º 35
0
def main():
    #Specify CTD interface
    # Every CTD Model has to have at least a name and a version, plus any of the optional attributes below them.
    model = argparse.ArgumentParser(description='Epitope prediction on peptide or protein input.')

    model.add_argument('-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
        )

    model.add_argument('-v',
        '--version',
        type=str,
        default="",
        help='The version of the prediction method'
        )

    model.add_argument('-i',
        '--input',
        type=str,
        required=True,
        help='Path to the input file'
        )

    model.add_argument('-t',
        '--type',
        choices=["fasta","peptide"],
        type=str,
        default="fasta",
        help='The data type of the input (fasta, peptide list)'
        )

    model.add_argument('-l',
        '--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument('-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
        )

    model.add_argument('-op',
        '--options',
        type=str,
        default="",
        help="Additional options that get directly past to the tool"
    )

    model.add_argument('-o',
        '--output',
        type=str,
        required=True,
        help='Path to the output file'
        )

    args = model.parse_args()


    #fasta protein
    if args.type == "fasta":
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if first_line.count("|") else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    #read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)
    if args.version == "":
        result = EpitopePredictorFactory(args.method).predict(peptides, alleles, options=args.options)
    else:
        result = EpitopePredictorFactory(args.method, version=args.version).predict(peptides, alleles,
                                                                 options=args.options)

    #write to TSV columns sequence method allele-scores...,protein-id/transcript-id
    with open(args.output, "w") as f:
        proteins = "\tAntigen ID" if args.type == "fasta" else ""
        alleles = result.columns
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in alleles)+proteins+"\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins =  "\t"+",".join( prot.transcript_id for prot in p.get_all_proteins()) if args.type == "fasta" else ""
            f.write(str(p)+"\t"+method+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+proteins+"\n")

    return 0
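For reference, the allele file passed via --alleles is expected to hold one allele per line in new HLA nomenclature; a hypothetical example file:

HLA-A*01:01
HLA-A*02:01
HLA-B*07:02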
Ejemplo n.º 36
0
 def test_wrong_internal_to_external_version(self):
     with self.assertRaises(RuntimeError):
         EpitopePredictorFactory("NetMHC",
                                 version="0.1").predict(self.peptides_mhcI,
                                                        alleles=self.mhcI)
Ejemplo n.º 37
0
class OptiTopeTestCase(unittest.TestCase):
    """
        Unittest for OptiTope
    """

    def setUp(self):
        self.proteins=[]
        self.alleles = [Allele("HLA-A*01:01"),Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
        self.peptides = [Peptide(p) for p in """SFSIFLLAL
GHRMAWDMM
VYEADDVIL
CFTPSPVVV
FLLLADARV
GPADGMVSK
YLYDHLAPM
GLRDLAVAV
GPTPLLYRL
TWVLVGGVL
IELGGKPAL
LAGGVLAAV
QYLAGLSTL
NFVSGIQYL
VLSDFKTWL
ARPDYNPPL
KLLPRLPGV
RHTPVNSWL
GLYLFNWAV
ALYDVVSTL
RRCRASGVL
WPLLLLLLA
VTYSLTGLW
YFVIFFVAA""".split()]
        self.result= EpitopePredictorFactory("NetMHC").predict(self.peptides, self.alleles)
        self.thresh = {"A*01:01":0,"B*07:02":0,"C*03:01":0}

    def test_selection_without_constraints(self):
        """
        tests if minimal selection without additional constraints (except the knapsack capacity) works

        #peptides obtained by performing optimization with the same input and parameters at
        etk.informatik.uni-tuebingen.de/optitope

        :return:
        """
        opt = OptiTope(self.result, self.thresh, k=3, solver="cplex", verbosity=0)
        r = opt.solve()
        self.assertTrue(len(set(str(p) for p in r) - set(["GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"])) == 0)

    def test_allele_cov_constraint(self):
         """
         tests the allele coverage constraints

         :return:
         """
         #self.alleles.extend([Allele("HLA-A*02:01"),Allele("HLA-B*15:01")])
         #self.thresh.update({"A*02:01":0,"B*15:01":0})
         self.result= EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
         print self.result[self.alleles[0:2]]
         opt = OptiTope(self.result,self.thresh,k=3,solver="cplex",verbosity=1)
         opt.activate_allele_coverage_const(0.8)
         r = opt.solve()
         res_df = self.result.xs(self.result.index.values[0][1], level="Method")
         peps = [str(p) for p in r]

         probs = {"A*01:01":1, "A*02:01":1, "B*07:02":1,"B*15:01":1,"C*03:01":1.0}
         res_df = res_df.loc[peps, :]
         res_df = res_df[[a for a in self.alleles]]
         res_df = res_df[res_df.apply(lambda x: any(x[a] > self.thresh[a.name] for a in self.alleles), axis=1)]

         print res_df.apply(lambda x: sum( x[c]*probs[c.name] for c in res_df.columns),axis=1)

         self.assertTrue(len(set(str(p) for p in r) - set(["ALYDVVSTL", "KLLPRLPGV", "GPTPLLYRL"])) == 0 )

    def test_epitope_conservation_constraint(self):
         import random
         self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
         conservation = {}
         print self.result.index.levels[0]
         for e in self.result.index.levels[0]:
             conservation[str(e)] = random.random()
         pt = OptiTope(self.result, self.thresh, k=3, solver="cplex" ,verbosity=1)
         pt.activate_epitope_conservation_const(0.5, conservation=conservation)
         for e in pt.solve():
             print e, conservation[e]
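A minimal standalone usage sketch for OptiTope, mirroring the tests above (the threshold values and the cbc solver choice are assumptions; a supported ILP solver must be installed):

peptides = [Peptide(p) for p in ["GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"]]
alleles = [Allele("HLA-A*01:01")]
result = EpitopePredictorFactory("BIMAS").predict(peptides, alleles)
opt = OptiTope(result, {"A*01:01": 0}, k=2, solver="cbc", verbosity=0)  # threshold dict keyed by allele name
selected = opt.solve()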
Ejemplo n.º 38
0
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-in', dest="inf", help='<Required> full path to the input file', required=True)
    parser.add_argument('-out', dest="out", help="<Required> full path to the output file", required=True)
    parser.add_argument('-allele', dest="allele", help="<Required> full path to an allele file, if 'in', allele file will be deduced from in file name", required=True)
    parser.add_argument('-dirallele', dest="dirallele", help="for use with '-allele in', describes full base path to the allele files")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if not (options.inf or options.out or options.allele):
        parser.print_help()
        sys.exit(1)

    target_alleles_set = set()
    #Fred2.FileReader.read_lines is broken
    #alleles = FileReader.read_lines(options.allele, type=Allele)
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print "No class 1 type run detected."
            sys.exit(0)
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        with open(af, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))
    else:
        with open(options.allele, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    ttn = EpitopePredictorFactory('netmhc')

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            #if "decoy" not in h.getMetaValue("target_decoy"):
                unmod = h.getSequence().toUnmodifiedString()
                if 7 < len(unmod) < 12 \
                        and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod:
                    pepstr.add(h.getSequence().toUnmodifiedString())

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the netMHC prediction", options.inf, "what:", str(e)
        sys.exit(1)

    #only max
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max() #bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    npeps = list()
    for pep in peps:
        hits = pep.getHits()
        nhits = list()
        for h in hits:
            if h.getSequence().toUnmodifiedString() in preds:
                x = preds[h.getSequence().toUnmodifiedString()]
                h.setMetaValue('binder', x[0])
                h.setMetaValue(str(x[1]), x[2])
                nhits.append(h)
            else:
                nhits.append(h)
        pep.setHits(nhits)

    f.store(options.out, pros, peps)
Ejemplo n.º 40
0
 def test_epitope_prediction_unsupported_version(self):
     print EpitopePredictorFactory("BIMAS", version="4.0").predict(
         self.peptides_mhcI, self.mhcI)
Ejemplo n.º 41
0
 def test_epitope_prediction_available_methods(self):
     print EpitopePredictorFactory.available_methods()
Ejemplo n.º 42
0
def toplevel_predictor(x):
    predictor = EpitopePredictorFactory("netMHC", version="3.4")
    peps = [Peptide(i) for i in x]
    return predictor.predict(peps)
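The function above is defined at module level, presumably so it can be pickled and shipped to multiprocessing workers; a usage sketch (batch contents are illustrative):

import multiprocessing as mp

if __name__ == "__main__":
    batches = [["GPTPLLYRL", "QYLAGLSTL"], ["ALYDVVSTL", "KLLPRLPGV"]]
    pool = mp.Pool(processes=2)
    results = pool.map(toplevel_predictor, batches)  # one result frame per batch
    pool.close()
    pool.join()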
Ejemplo n.º 43
0
def main():
    parser = argparse.ArgumentParser(
        description=
        'The software is a novel approach to construct epitope-based string-of-beads \
vaccines in optimal order and with sequence-optimized spacers of flexible length \
such that the recovery of contained epitopes is maximized and immunogenicity of \
arising neo-epitopes is reduced.', )

    parser.add_argument('-i',
                        "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)",
                        type=str)

    parser.add_argument(
        '-a',
        "--alleles",
        required=True,
        help=
        "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)",
        type=str)

    #parameters of the model
    parser.add_argument(
        '-l',
        "--max_length",
        default=6,
        type=int,
        help="Specifies the max. length of the spacers (default 6)",
    )

    parser.add_argument(
        '-al',
        "--alpha",
        default=0.99,
        type=float,
        help=
        "Specifies the first-order preference of the user in the model [0,1] (default 0.99)",
    )

    parser.add_argument(
        '-be',
        "--beta",
        default=0.0,
        type=float,
        help=
        "Specifies the second-order preference of the user in the model [0,1] (default 0).",
    )

    parser.add_argument(
        '-cp',
        "--cleavage_prediction",
        default="pcm",
        choices=["pcm", "proteasmm_c", "proteasmm_i"],
        help=
        "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_I]",
        type=str)

    parser.add_argument(
        '-ep',
        "--epitope_prediction",
        default="syfpeithi",
        choices=["syfpeithi", "smm", "smmpmbec", "bimas"],
        help=
        "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]",
        type=str)
    parser.add_argument(
        '-t',
        "--threshold",
        default=20,
        type=float,
        help=
        "Specifies epitope prediction threshold for SYFPEITHI (default 20).",
    )

    parser.add_argument(
        '-o',
        "--output",
        required=True,
        type=str,
        help="Specifies the output file.",
    )

    parser.add_argument(
        '-p',
        "--threads",
        type=int,
        default=None,
        help=
        "Specifies number of threads. If not specified all available logical cpus are used.",
    )
    parser.add_argument(
        '-apx',
        "--approximate",
        action="store_true",
        help=
        "Use an approximate solver for the assembly problem; falls back to exact solving if no solution is found.",
    )

    args = parser.parse_args()

    #parse input
    peptides = read_lines(args.input)
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in [
            "PCM", "PROTEASMM_C", "PROTEASMM_I"
    ]:
        sys.stderr.write(
            "Specified cleavage predictor is currently not supported. \
                         Please choose either PCM, PROTEASMM_C, or PROTEASMM_I"
        )
        sys.exit(-1)

    if args.epitope_prediction.upper() not in [
            "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"
    ]:
        sys.stderr.write(
            "Specified epitope predictor is currently not supported. \
                         Please choose either Syfpeithi, BIMAS, SMM, or SMMPMBEC")
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver="cbc",
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=1)
    #solve
    #pre-processing has to be disabled otherwise many solvers will destroy the symmetry of the problem
    #how to do this depends on the solver used. For CPLEX it is preprocessing_presolve=n
    #TODO:CBC should be shipped with the node
    #TODO: has to be tested with CBC
    #TODO: LHK has to be shipped as well -> only academic license!
    #"preprocess":"off", "threads":1}
    threads = mp.cpu_count() if args.threads is None else args.threads
    if args.approximate:
        svbws = solver.approximate(threads=threads,
                                   options={
                                       "preprocess": "off",
                                       "threads": 1
                                   })
        if not svbws:
            svbws = solver.solve(threads=threads,
                                 options={
                                     "preprocess": "off",
                                     "threads": 1
                                 })
    else:
        svbws = solver.solve(threads=threads,
                             options={
                                 "preprocess": "off",
                                 "threads": 1
                             })

    with open(args.output, "w") as f:
        f.write(">assembled_spacer_design\n")
        f.write("".join(map(str, svbws)))
    return 0
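The output is a single-record FASTA whose sequence concatenates the selected epitopes and spacers in assembly order; a hypothetical example (epitopes and spacers shown are illustrative only):

>assembled_spacer_design
GPTPLLYRLHHAQYLAGLSTLWWRALYDVVSTL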
Ejemplo n.º 44
0
def main():

    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInspector.')

    model.add_argument(
        '-m','--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
        )


    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
        )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
        )

    model.add_argument(
        '-p','--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
        )

    model.add_argument(
        '-l','--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument(
        '-a','--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
        )

    model.add_argument(
        '-r' ,'--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for variant annotation and calling.'
        )

    model.add_argument(
        '-fINDEL' ,'--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
        )

    model.add_argument(
        '-fFS' ,'--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
        )

    model.add_argument(
        '-fSNP' ,'--filterSNP',
        action="store_true",
        help='Filter SNPs'
        )

    model.add_argument(
        '-o','--output',
        type=str,
        required=True,
        help='Path to the output file'
        )
    model.add_argument(
        '-etk','--etk',
        action="store_true",
        help=argparse.SUPPRESS
        )

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)

        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS,
                                                       VariationType.DEL,
                                                       VariationType.FSDEL,
                                                       VariationType.FSINS], variants)

        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        epitopes = filter(lambda x:any(x.get_variants_by_protein(tid) for tid in x.proteins.iterkeys()),
                        generate_peptides_from_variants(variants,
                                                int(args.length), martDB, EIdentifierTypes.ENSEMBL))

        for v in variants:
            for trans_id,coding in v.coding.iteritems():
                if coding.geneID!=None:
                   transcript_to_genes[trans_id] = coding.geneID
                else:
                   transcript_to_genes[trans_id] = 'None'



    #else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(Protein(protein_seq, gene_id=l.strip(), transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))


    #read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in alleles)+"\tAntigen ID\t"+var_column+"\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t"+"|".join(set(prot_id.split(":FRED2")[0]+":"+",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                                                                            for prot_id in p.proteins.iterkeys()
                                          if p.get_variants_by_protein(prot_id)))
            
            f.write(str(p)+"\t"+method+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+proteins+vars_str+"\n")

    if args.etk:
        with open(args.output.rsplit(".",1)[0]+"_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t"+"\t".join(a.name for a in alleles)+"\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]))
                g.write(str(p)+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+proteins+"\n")
    return 0
Ejemplo n.º 45
0
 def test_wrong_allele_input(self):
     with self.assertRaises(ValueError):
         EpitopePredictorFactory("NetMHC").predict(self.mhcI,
                                                   alleles=self.transcript)
Ejemplo n.º 46
0
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)"
    )
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument("-k","--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al","--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be","--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")

    parser.add_argument("-cp","--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument("-ep","--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument("-thr","--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")


    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name:args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles,
                                       k=args.max_length,en=9,threshold=thr,
                                       solver="cplex", alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disabled otherwise many solvers will destroy the symmetry of the problem
    #how to do this depends on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,options={"preprocessing_presolve":"n","threads":1})

    print
    print "Resulting String-of-Beads: ","-".join(map(str,svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str,svbws)))
Ejemplo n.º 47
0
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values(
        )[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
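A toy illustration (values invented) of the binder filter used above: a peptide row is kept if any allele column exceeds the threshold, e.g. 0.4256 for at least a weak binder.

import pandas as pd

preds = pd.DataFrame({'HLA-A*01:01': [0.10, 0.61],
                      'HLA-B*07:02': [0.20, 0.30]},
                     index=['SYFPEITHI', 'KLLPRLPGV'])
print preds[(preds > 0.4256).any(axis=1)]  # keeps only KLLPRLPGV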
Ejemplo n.º 48
0
def main():
    parser = argparse.ArgumentParser(
        description=
        """The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a",
        "--alleles",
        required=True,
        help=
        "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument(
        "-k",
        "--max_length",
        default=6,
        type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al",
        "--alpha",
        default=0.99,
        type=float,
        help=
        "Specifies the first-order preference of the user in the model [0,1] (default 0.99)"
    )
    parser.add_argument(
        "-be",
        "--beta",
        default=0.0,
        type=float,
        help=
        "Specifies the second-order preference of the user in the model [0,1] (default 0)."
    )

    parser.add_argument(
        "-cp",
        "--cleavage_prediction",
        default="PCM",
        help=
        "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument(
        "-ep",
        "--epitope_prediction",
        default="Syfpeithi",
        help=
        "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument(
        "-thr",
        "--threshold",
        default=20,
        type=float,
        help=
        "Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=None,
        help=
        "Specifies number of threads. If not specified all available logical cpus are used."
    )

    parser.add_argument(
        "--ips-solver",
        default="cplex",
        choices=["cplex", "cbc"],
        help=
        "Executable name of the IPS solver. Executable needs to be available in PATH."
    )

    parser.add_argument("--tsp-solution",
                        default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")

    parser.add_argument(
        "--random-order",
        action="store_true",
        help=
        "Indicate whether to generate a random ordered string-of-beads polypeptide"
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=1,
        help="Seed for random ordering of string-of-beads polypeptide")

    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in [
            "PCM", "PROTEASMM_C", "PROTEASMM_S"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in [
            "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver=args.ips_solver,
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disabled otherwise many solvers will destroy the symmetry of the problem
    #how to do this depends on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads

    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads,
                                   options={
                                       "preprocessing_presolve": "n",
                                       "threads": 1
                                   })
    else:
        svbws = solver.solve(threads=threads,
                             options={
                                 "preprocessing_presolve": "n",
                                 "threads": 1
                             })

    # Generate a randomly ordered string-of-beads, but still use the optimal
    # spacers determined by the solve step above.
    if args.random_order:
        print "Generating a randomly ordered polypeptide"
        random.seed(args.seed)
        random_order_sob = []
        random.shuffle(peptides)
        for i in range(len(peptides)):

            # Break from loop once we hit the last peptide
            if i == len(peptides) - 1:
                random_order_sob.extend([Peptide(str(peptides[i]))])
                break

            left_peptide = str(peptides[i])
            right_peptide = str(peptides[i + 1])
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]

            # Right peptide gets added in the next iteration
            random_order_sob.extend(
                [Peptide(left_peptide),
                 Peptide(opt_spacer)])

        svbws = random_order_sob

    print
    print "Resulting String-of-Beads: ", "-".join(map(str, svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str, svbws)))
Ejemplo n.º 49
0
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)


parser = argparse.ArgumentParser(description='Call epitope predictors on data.')
requiredNamed = parser.add_argument_group('required arguments')
requiredNamed.add_argument('--predictor', type=str, help='Epitope predictors [see all with --predictor=list]', required=True)
requiredNamed.add_argument('--dataset', type=str, help='Immunogenic dataset [see all with --dataset=list]', required=True)
parser.add_argument('-n', type=int, help='Number of rows to take from dataset')
parser.add_argument('--allele', type=str, help='HLA Type', default=["HLA-A*01:01","HLA-A*02:01","HLA-B*15:01"])

args = parser.parse_args()

all_predictors = [ name for name,version in EpitopePredictorFactory.available_methods().iteritems()]

all_predictors.remove("netmhcstabpan")
all_predictors.remove("netmhc")

if args.predictor == 'list':
	print("Set one of the predictors with --predictor:")
	print(all_predictors)
	print ("""
Details from https://bioinformatics.oxfordjournals.org/content/suppl/2016/02/26/btw113.DC1/S1.pdf
 SYFPEITHI     T-cell epitope  (Rammensee, et al., 1999)
 BIMAS         MHC-I binding   (Parker, et al., 1994)
 SVMHC         MHC-I binding   (Dönnes and Elofsson, 2002)
 ARB           MHC-I binding   (Bui, et al., 2005)
 SMM           MHC-I binding   (Peters and Sette, 2005)
 SMMPMBEC      MHC-I binding   (Kim, et al., 2009)