def test_path_option_and_optional_parameters(self):
     netmhc = EpitopePredictorFactory("NetMHC")
     exe = netmhc.command.split()[0]
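     # scan each PATH entry for the NetMHC binary and run the prediction only if an executable is found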
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
             netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, path=exe_try, options="--sort")
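     # A minimal sketch of the same PATH lookup using the standard library
     # (assumes Python 2, where distutils.spawn is available):
     #
     #     from distutils.spawn import find_executable
     #     exe_try = find_executable(exe)  # None if the binary is not on PATH
     #     if exe_try:
     #         netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, path=exe_try, options="--sort")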
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (mo.version=="0.1" and mo.name=="netmhc"):
                 if any(a.name in mo.supportedAlleles for a in self.mhcII):
                     mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                 else:
                     mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
 def test_path_and_optional_parameters_netctl(self):
     netctlpan = EpitopePredictorFactory("NetCTLpan")
     exe = netctlpan.command.split()[0]
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
             print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI,
                                      command=exe_try,
                                     options="-wt 0.05 -wc 0.225 -ethr 0.5")
Example 5
    def test_pareto_assembly(self):
        cl_pred = CleavageSitePredictorFactory("PCM")
        ep_pred = EpitopePredictorFactory("SMM")
        allele = [Allele("HLA-A*02:01")]
        thresh = {a.name:10000 for a in allele}
        comp = lambda a,b: a <= b
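        # thresh of 10000 with a <= comparator is presumably chosen so that every peptide passes the binder check in this test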

        print ep_pred.predict(self.peptides,alleles=allele)
        #cl_pred, ep_pred, alleles, threshold, comparator, length=9

        assembler = ParetoEpitopeAssembly(self.peptides,cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=1)
        r = assembler.solve(eps=1e10, order=(1,0))
        print r
Example 6
 def test_single_peptide_input_mhcII(self):
     for m in EpitopePredictorFactory.available_methods():
         model = EpitopePredictorFactory(m)
         if not isinstance(model, AExternalEpitopePrediction):
             if all(a.name in model.supportedAlleles for a in self.mhcII):
                 res = model.predict(self.peptides_mhcII[0],
                                     alleles=self.mhcII[1])
Example 7
    def est_multiple_peptide_input_mhcI(self):
        # note: the missing 't' in 'est_' means unittest will not discover this test
        for m in EpitopePredictorFactory.available_methods():
            model = EpitopePredictorFactory(m)
            if all(a.name in model.supportedAlleles for a in self.mhcI):
                res = model.predict(self.peptides_mhcI, alleles=self.mhcI)
Example 8
 def test_path_option_and_optional_parameters_netmhc(self):
     netmhc = EpitopePredictorFactory("NetMHC")
     exe = netmhc.command.split()[0]
     for try_path in os.environ["PATH"].split(os.pathsep):
         try_path = try_path.strip('"')
         exe_try = os.path.join(try_path, exe).strip()
         if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
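             # chunksize=1 presumably hands the peptides to the external NetMHC binary one at a time (smaller batches avoid long command lines)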
             r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, command=exe_try, options="--sort", chunksize=1)
             self.assertTrue(len(r) == len(self.peptides_mhcI))
             self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"], 0.150579105869, places=7, msg=None, delta=None)
             self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"], 0.0619540879359, places=7, msg=None, delta=None)
Example 10
    def test_pareto_assembly(self):
        cl_pred = CleavageSitePredictorFactory("PCM")
        ep_pred = EpitopePredictorFactory("SMM")
        allele = [Allele("HLA-A*02:01")]
        thresh = {a.name: 10000 for a in allele}
        comp = lambda a, b: a <= b

        print ep_pred.predict(self.peptides, alleles=allele)
        #cl_pred, ep_pred, alleles, threshold, comparator, length=9

        assembler = ParetoEpitopeAssembly(self.peptides,
                                          cl_pred,
                                          ep_pred,
                                          allele,
                                          thresh,
                                          comp,
                                          solver="glpk",
                                          verbosity=1)
        r = assembler.solve(eps=1e10, order=(1, 0))
        print r
def run_predictor(pred, dataset):
	predictor = EpitopePredictorFactory(pred)
	results = ()
	try:
		results = predictor.predict(dataset, alleles=[ Allele(a) for a in args.allele ])
		print(results)
		print(results.describe())
	except ValueError:
		pass
	
	return(len(results),len(dataset))
Example 12
def run_predictor(pred, dataset):
    predictor = EpitopePredictorFactory(pred)
    results = ()
    try:
        results = predictor.predict(dataset,
                                    alleles=[Allele(a) for a in args.allele])
        print(results)
        print(results.describe())
    except ValueError:
        pass

    return (len(results), len(dataset))
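# run_predictor() reads the allele list from a module-level `args` object. A minimal,
# hypothetical driver sketch (the Namespace, allele string and peptide sequences below
# are illustrative, not part of the original script; assumes Fred2 exposes Peptide via Fred2.Core):
#
#     from argparse import Namespace
#     from Fred2.Core import Peptide
#
#     args = Namespace(allele=["HLA-A*02:01"])   # provide the global the function expects
#     dataset = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]
#     n_pred, n_total = run_predictor("Syfpeithi", dataset)
#     print("predicted {} of {} peptides".format(n_pred, n_total))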
Example 13
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (
                     mo.version == "0.1" and mo.name == "netmhc"):
                 print "Testing", mo.name, "version", mo.version
                 try:
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcII):
                         mo.predict(self.peptides_mhcII,
                                    alleles=self.mhcII[0])
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcII_combined_alleles):
                         mo.predict(self.peptides_mhcII,
                                    alleles=self.mhcII_combined_alleles[0])
                     if any(a.name in mo.supportedAlleles
                            for a in self.mhcI):
                         mo.predict(self.peptides_mhcI,
                                    alleles=self.mhcI[0])
                     print "Success"
                 except RuntimeError as e:  #catch only those stemming from binary unavailability
                     if "could not be found in PATH" not in e.message:
                         raise e  # re-raise errors not caused by a missing binary
                     else:
                         print mo.name, "not available"
Example 14
 def test_single_allele_input(self):
     for m in EpitopePredictorFactory.available_methods():
         for v in EpitopePredictorFactory.available_methods()[m]:
             mo = EpitopePredictorFactory(m, version=v)
             if isinstance(mo, AExternalEpitopePrediction) and not (mo.version=="0.1" and mo.name=="netmhc"):
                 print "Testing", mo.name, "version", mo.version
                 try:
                     if any(a.name in mo.supportedAlleles for a in self.mhcII):
                         mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                     if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles):
                         mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0])
                     if any(a.name in mo.supportedAlleles for a in self.mhcI):
                         mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
                     print "Success"
                 except RuntimeError as e: #catch only those stemming from binary unavailability
                     if "could not be found in PATH" not in e.message:
                         raise e  # re-raise errors not caused by a missing binary
                     else:
                         print mo.name, "not available"
Example 15
def make_predictions_from_variants(variants_all, methods, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap):
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = [p for p in generator.generate_proteins_from_transcripts(generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED))]

    for peplen in range(minlength, maxlength):
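        # note: range() excludes maxlength, so the longest peptides generated here are maxlength - 1 residues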
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)

        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for method, version in methods.items():
                try:
                    predictor = EpitopePredictorFactory(method, version=version)
                    results.extend([predictor.predict(filtered_peptides, alleles=alleles)])
                except Exception:
                    logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version))

        if len(results) == 0:
            continue

        df = pd.concat(results)

        for a in alleles:
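            # record the SYFPEITHI matrix name and its maximum score for this allele/length pair, used later for score normalisation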
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        for c in df.columns:
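            # for each allele column add derived '<allele> affinity' and '<allele> binder' columns and rename the raw values to '<allele> score'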
            if ('HLA-' in str(c)) or ('H-2-' in str(c)):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1)

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()],
                  'number_of_variants': len(variants_all),
                  'number_of_unique_peptides': [str(p) for p in all_peptides],
                  'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered]}

    return pred_dataframes, statistics, all_peptides_filtered, prots
Example 16
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
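    # the result index is (peptide, prediction method); store each row's per-allele scores in the peptide's metadata under the method name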
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Example 17
def toplevel_predictor(x):
    predictor = EpitopePredictorFactory("netMHC", version="3.4")
    peps = [Peptide(i) for i in x]
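    # no alleles argument is given, so predict() presumably falls back to all alleles supported by netMHC 3.4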
    return predictor.predict(peps)
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-c',
                        dest="mhcclass",
                        help='<Required> MHC class',
                        required=True)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input file',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the output file",
                        required=True)
    parser.add_argument(
        '-allele',
        dest="allele",
        help=
        "<Required> full path to an allele file, if 'in', allele file will be deduced from in file name",
        required=True)
    parser.add_argument(
        '-dirallele',
        dest="dirallele",
        help=
        "for use with '-allele in', describes full base path to the allele files"
    )

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if not (options.inf or options.out or options.allele):
        parser.print_help()
        sys.exit(1)

    target_alleles_set = set()
    #Fred2.FileReader.read_lines is broken
    #alleles = FileReader.read_lines(options.allele, type=Allele)
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print "No class 1 type run detected."
            sys.exit(0)
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        with open(af, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))
    else:
        with open(options.allele, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    if options.mhcclass == "I":
        ttn = EpitopePredictorFactory('netmhcpan', version='3.0')
        lowerBound = 8
        upperBound = 12
    elif options.mhcclass == "II":
        ttn = EpitopePredictorFactory('netmhcIIpan', version='3.1')
        lowerBound = 15
        upperBound = 25
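    # note: there is no else branch - an MHC class other than "I" or "II" would leave ttn and the length bounds undefined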

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    pepstr = set()
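    # collect unmodified hit sequences within the class-specific length bounds and without ambiguous residues (U/B/X/Z)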
    for pep in peps:
        for h in pep.getHits():
            #if "decoy" not in h.getMetaValue("target_decoy"):
            unmod = h.getSequence().toUnmodifiedString()
            if lowerBound <= len(unmod) <= upperBound \
                    and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod:
                pepstr.add(h.getSequence().toUnmodifiedString())

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the netMHC prediction", options.inf, "what:", str(
            e)
        sys.exit(1)

    # keep only the best (maximum) score per peptide and the allele that produced it
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max()  #bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    npeps = list()
    for pep in peps:
        hits = pep.getHits()
        nhits = list()
        for h in hits:
            if h.getSequence().toUnmodifiedString() in preds:
                x = preds[h.getSequence().toUnmodifiedString()]
                h.setMetaValue('binder', x[0])
                h.setMetaValue(str(x[1]), x[2])
                nhits.append(h)
            else:
                nhits.append(h)
        pep.setHits(nhits)

    f.store(options.out, pros, peps)
Example 20
def load_allele_model(allele_model, length):
    # NOTE: this snippet begins mid-function; the enclosing definition and the
    # opening try: are reconstructed here from the call site below.
    allele_model = "%s_%i" % (allele_model, length)  # assumption: SYFPEITHI matrices are named <allele>_<length>
    try:
        return matrix_max(
            getattr(
                __import__("Fred2.Data.pssms.syfpeithi" + ".mat." + allele_model,
                           fromlist=[allele_model]), allele_model))
    except ImportError:
        return None


# Calculate the maximum attainable score for each allele
converted_alleles = dict(zip(alleles, predictor.convert_alleles(alleles)))
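# convert_alleles() maps the Allele objects to the predictor's internal (matrix) naming scheme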
max_score_by_allele = {(allele, length):
                       load_allele_model(converted_alleles[allele], length)
                       for length in predictor.supportedLength
                       for allele in alleles}

# Run predictions and output results
print 'Peptide\tAllele\tSyfpeithiRawScore\tSyfpeithiNormScore'
for pep_len, peptides in peptides_by_length.items():
    for allele in alleles:
        if (allele, pep_len) in max_score_by_allele and max_score_by_allele[allele, pep_len]:
            results = predictor.predict(peptides, alleles=[allele])
            for index, row in results.iterrows():
                print '{}\t{}\t{}\t{}'.format(
                    str(index[0]), allele, row[allele],
                    float(row[allele]) / max_score_by_allele[allele, pep_len])
        else:
            for peptide in peptides:
                print '{}\t{}\t{}\t{}'.format(peptide, allele, 'NA', 'NA')
Example 21
def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # filter out self peptides if specified
    selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
    peptides_filtered = [p for p in peptides if str(p) not in selfies]

    # group peptides by length (predictions are run per peptide length)
    sorted_peptides = {}

    for p in peptides_filtered:
        length = len(str(p))
        if length in sorted_peptides:
            sorted_peptides[length].append(p)
        else:
            sorted_peptides[length] = [p]

    for peplen in sorted_peptides:
        all_peptides_filtered = sorted_peptides[peplen]
        results = []
        for method, version in methods.items():
            try:
                predictor = EpitopePredictorFactory(method, version=version)
                results.extend([predictor.predict(all_peptides_filtered, alleles=alleles)])
            except Exception:
                logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}. No model available.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version))

        # merge dataframes of the performed predictions
        if len(results) == 0:
            continue

        df = pd.concat(results)

        df.insert(0, 'length', df.index.map(create_length_column_value))

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele,peplen)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'synonymous', 'homozygous', 'variant details (genomic)', 'variant details (protein)']

        for header in mandatory_columns:
            if header not in metadata:
                df[header] = np.nan
            else:
                df[header] = df.apply(lambda row: row[0].get_metadata(header)[0], axis=1)

        for c in list(set(metadata) - set(mandatory_columns)):
            df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1)

        for c in df.columns:
            if ('HLA-' in str(c)) or ('H-2-' in str(c)):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # write prediction statistics
    statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()],
                  'number_of_variants': '-',
                  'number_of_unique_peptides': [str(p) for p in peptides],
                  'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered]}
    return pred_dataframes, statistics
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-in', dest="inf", help='<Required> full path to the input file', required=True)
    parser.add_argument('-out', dest="out", help="<Required> full path to the output file", required=True)
    parser.add_argument('-allele', dest="allele", help="<Required> full path to an allele file, if 'in', allele file will be deduced from in file name", required=True)
    parser.add_argument('-dirallele', dest="dirallele", help="for use with '-allele in', describes full base path to the allele files")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if not (options.inf or options.out or options.allele):
        parser.print_help()
        sys.exit(1)

    target_alleles_set = set()
    #Fred2.FileReader.read_lines is broken
    #alleles = FileReader.read_lines(options.allele, type=Allele)
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print "No class 1 type run detected."
            sys.exit(0)
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        with open(af, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))
    else:
        with open(options.allele, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    ttn = EpitopePredictorFactory('netmhc')

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            #if "decoy" not in h.getMetaValue("target_decoy"):
                unmod = h.getSequence().toUnmodifiedString()
                if 7 < len(unmod) < 12 \
                        and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod:
                    pepstr.add(h.getSequence().toUnmodifiedString())

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the netMHC prediction", options.inf, "what:", str(e)
        sys.exit(1)

    # keep only the best (maximum) score per peptide and the allele that produced it
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max() #bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    npeps = list()
    for pep in peps:
        hits = pep.getHits()
        nhits = list()
        for h in hits:
            if h.getSequence().toUnmodifiedString() in preds:
                x = preds[h.getSequence().toUnmodifiedString()]
                h.setMetaValue('binder', x[0])
                h.setMetaValue(str(x[1]), x[2])
                nhits.append(h)
            else:
                nhits.append(h)
        pep.setHits(nhits)

    f.store(options.out, pros, peps)
Example 23
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Example 24
    def test_single_peptide_input_mhcII(self):
        for m in EpitopePredictorFactory.available_methods():
            model = EpitopePredictorFactory(m)
            if not isinstance(model, AExternalEpitopePrediction):
                if all(a.name in model.supportedAlleles for a in self.mhcII):
                    res = model.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])