Ejemplo n.º 1
0
 def test_single_allele_input(self):
     """Smoke-test every external predictor with a single allele as input.

     Every available method/version is instantiated; predictors that are not
     instances of ``AExternalEpitopePrediction`` and netmhc version 0.1 are
     skipped. For each allele group (MHC-II, combined MHC-II, MHC-I) of which
     at least one allele name is listed in ``supportedAlleles``, ``predict``
     is called with the first allele of that group.

     A ``RuntimeError`` whose text contains "could not be found in PATH" is
     reported and tolerated; every other ``RuntimeError`` propagates.
     """
     for m, versions in EpitopePredictorFactory.available_methods().items():
         for v in versions:
             mo = EpitopePredictorFactory(m, version=v)
             if not isinstance(mo, AExternalEpitopePrediction):
                 continue
             if mo.version == "0.1" and mo.name == "netmhc":
                 continue
             # Single-argument print(...) behaves the same on Python 2 and 3.
             print("Testing {0} version {1}".format(mo.name, mo.version))
             try:
                 if any(a.name in mo.supportedAlleles
                        for a in self.mhcII):
                     mo.predict(self.peptides_mhcII,
                                alleles=self.mhcII[0])
                 if any(a.name in mo.supportedAlleles
                        for a in self.mhcII_combined_alleles):
                     mo.predict(self.peptides_mhcII,
                                alleles=self.mhcII_combined_alleles[0])
                 if any(a.name in mo.supportedAlleles
                        for a in self.mhcI):
                     mo.predict(self.peptides_mhcI,
                                alleles=self.mhcI[0])
                 print("Success")
             except RuntimeError as e:
                 # Catch only errors stemming from binary unavailability.
                 # str(e) replaces the deprecated e.message attribute, which
                 # does not exist on Python 3 exceptions.
                 if "could not be found in PATH" not in str(e):
                     raise  # bare raise keeps the original traceback
                 print("{0} not available".format(mo.name))
 def test_single_allele_input(self):
     """Call ``predict`` with one allele on every external predictor.

     Predictors that are not ``AExternalEpitopePrediction`` instances and
     netmhc version 0.1 are skipped. If any allele name in ``self.mhcII`` is
     listed in the predictor's ``supportedAlleles``, the MHC-II peptides and
     the first MHC-II allele are used; otherwise the MHC-I peptides and the
     first MHC-I allele are used.
     """
     for method_name in EpitopePredictorFactory.available_methods():
         for version in EpitopePredictorFactory.available_methods()[method_name]:
             predictor = EpitopePredictorFactory(method_name, version=version)
             if not isinstance(predictor, AExternalEpitopePrediction):
                 continue
             if predictor.version == "0.1" and predictor.name == "netmhc":
                 continue

             handles_mhcII = any(allele.name in predictor.supportedAlleles
                                 for allele in self.mhcII)
             if handles_mhcII:
                 peptides, allele = self.peptides_mhcII, self.mhcII[0]
             else:
                 peptides, allele = self.peptides_mhcI, self.mhcI[0]
             predictor.predict(peptides, alleles=allele)
Ejemplo n.º 3
0
 def test_single_peptide_input_mhcII(self):
     """Call ``predict`` with a single MHC-II peptide on internal predictors.

     Each available method is built without an explicit version. External
     predictors (``AExternalEpitopePrediction``) are skipped, as are
     predictors that do not list every allele name of ``self.mhcII`` in
     ``supportedAlleles``. The remaining ones get the first MHC-II peptide
     and the second MHC-II allele.
     """
     for method_name in EpitopePredictorFactory.available_methods():
         predictor = EpitopePredictorFactory(method_name)
         if isinstance(predictor, AExternalEpitopePrediction):
             continue
         if not all(allele.name in predictor.supportedAlleles
                    for allele in self.mhcII):
             continue
         # The result is not inspected; the call only has to succeed.
         prediction = predictor.predict(self.peptides_mhcII[0],
                                        alleles=self.mhcII[1])
Ejemplo n.º 4
0
    def est_multiple_peptide_input_mhcI(self):
        """Call ``predict`` with all MHC-I peptides and all MHC-I alleles.

        Each available method is built without an explicit version; methods
        that do not list every allele name of ``self.mhcI`` in
        ``supportedAlleles`` are skipped.

        NOTE(review): the name starts with "est_" rather than "test_", so it
        is presumably not collected by default test discovery - confirm
        whether disabling it was intended.
        """
        for method_name in EpitopePredictorFactory.available_methods():
            predictor = EpitopePredictorFactory(method_name)
            covers_all = all(allele.name in predictor.supportedAlleles
                             for allele in self.mhcI)
            if not covers_all:
                continue
            # The result is not inspected; the call only has to succeed.
            prediction = predictor.predict(self.peptides_mhcI,
                                           alleles=self.mhcI)
Ejemplo n.º 5
0
def valid_predictors(supported_length=9,
                     exclude_predictors=("epidemix", "unitope", "netctlpan")):
    """
    Get the information for all predictors and keep only
    the relevant ones.

    A predictor is kept when `supported_length` is among its
    "supportedLength" values, its "type" is not null, its "is_in_path"
    value is null or true, and its name is not listed in
    `exclude_predictors`. The number of removed and remaining methods is
    printed.

    Args:
       supported_length (int): Supported peptide input length.
       exclude_predictors (iterable of str): Names of methods to remove in
           addition to the filters above.

    Returns:
       pandas.DataFrame: One row per remaining predictor, built from the
       `predictor_info` result of each available method.
    """
    methods = EpitopePredictorFactory.available_methods()
    dt = pd.DataFrame([predictor_info(method) for method in methods])
    n_init = len(dt)

    dt = dt[[supported_length in elems for elems in dt["supportedLength"]]]
    dt = dt[dt["type"].notnull()]  # we should know where it was trained
    dt = dt[dt["is_in_path"].isnull() | dt["is_in_path"]]

    # One vectorised pass instead of re-filtering the frame once per name.
    dt = dt[~dt["name"].isin(list(exclude_predictors))]

    print("removed {0} methods from Fred2. {1} remain".format(
        n_init - len(dt), len(dt)))

    return dt
 def test_single_allele_input(self):
     """Run each external predictor with a single allele per allele group.

     Every available method/version is instantiated; predictors that are not
     ``AExternalEpitopePrediction`` instances and netmhc version 0.1 are
     skipped. For each allele group of which at least one allele name is
     listed in ``supportedAlleles``, ``predict`` is called with the group's
     peptides and the first allele of the group.

     A ``RuntimeError`` whose text contains "could not be found in PATH" is
     reported and tolerated; every other ``RuntimeError`` propagates.
     """
     for m, versions in EpitopePredictorFactory.available_methods().items():
         for v in versions:
             mo = EpitopePredictorFactory(m, version=v)
             if not isinstance(mo, AExternalEpitopePrediction):
                 continue
             if mo.version == "0.1" and mo.name == "netmhc":
                 continue
             # Single-argument print(...) behaves the same on Python 2 and 3.
             print("Testing {0} version {1}".format(mo.name, mo.version))
             try:
                 # (alleles of the group, peptides to predict for that group)
                 runs = (
                     (self.mhcII, self.peptides_mhcII),
                     (self.mhcII_combined_alleles, self.peptides_mhcII),
                     (self.mhcI, self.peptides_mhcI),
                 )
                 for alleles, peptides in runs:
                     if any(a.name in mo.supportedAlleles for a in alleles):
                         mo.predict(peptides, alleles=alleles[0])
                 print("Success")
             except RuntimeError as e:
                 # Catch only errors stemming from binary unavailability.
                 # str(e) replaces the deprecated e.message attribute, which
                 # does not exist on Python 3 exceptions.
                 if "could not be found in PATH" not in str(e):
                     raise  # bare raise keeps the original traceback
                 print("{0} not available".format(mo.name))
Ejemplo n.º 7
0
def __main__():
    """Write the alleles and peptide lengths supported by the used tool versions.

    The file given with ``--versions`` is read as ":"-separated rows of tool
    name and version. For every known method whose name contains one of the
    listed tool names, two files are written to the working directory:
    ``<method>.v<version>.supported_alleles.txt`` and
    ``<method>.v<version>.supported_lengths.txt``, one entry per line.

    Raises:
        ValueError: If a tool version is not among the versions Fred2 offers
            for that method.
    """
    # The text is the description; passed positionally it would become
    # argparse's `prog` and end up in the usage line.
    parser = argparse.ArgumentParser(
        description="Write out information about supported models by Fred2 for available prediction tool versions."
    )
    parser.add_argument('-v',
                        '--versions',
                        help='File with used software versions.',
                        required=True)
    args = parser.parse_args()

    # NOTE this needs to be updated manually, if other methods should be used in the future
    available_methods = [
        'syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2'
    ]
    tool_version = []
    with open(args.versions, 'r') as versions_file:
        for row in csv.reader(versions_file, delimiter=":"):
            # Blank lines and lines without a ":" carry no tool/version pair
            # and used to abort the run with an IndexError.
            if len(row) < 2 or not row[0].strip():
                continue
            tool_version.append((row[0].split()[0], str(row[1])))
    # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
    tool_version.append(('syfpeithi', '1.0'))
    # get for each method the corresponding tool version
    methods = {
        method.strip(): version.strip()
        for tool, version in tool_version for method in available_methods
        if tool.lower() in method.lower()
    }

    supported_versions = EpitopePredictorFactory.available_methods()
    for method, version in methods.items():
        if version not in supported_versions[method]:
            raise ValueError("The specified version " + version + " for " +
                             method + " is not supported by Fred2.")

        predictor = EpitopePredictorFactory(method, version=version)
        prefix = method + ".v" + str(version)
        with open(prefix + ".supported_alleles.txt", 'w') as output:
            for a in sorted(predictor.supportedAlleles):
                output.write(convert_allele_back(a) + "\n")
        with open(prefix + ".supported_lengths.txt", 'w') as output:
            for l in sorted(predictor.supportedLength):
                output.write(str(l) + "\n")
Ejemplo n.º 8
0
def main():
    """Predict epitopes from annotated variants or from a list of proteins.

    With ``--vcf`` the variants are read with the reader selected by
    ``--type``, filtered by variation type according to the ``--filter*``
    flags, and turned into peptides of ``--length``. Without ``--vcf`` the
    IDs in ``--proteins`` are resolved through BioMart and peptides are
    generated from the protein sequences. Predictions of ``--method`` for the
    alleles in ``--alleles`` are written tab-separated to ``--output``; with
    ``--etk`` a second table ``<output stem>_etk.tsv`` is written.

    Returns:
        int: 0 on success, -1 if neither ``--vcf`` nor ``--proteins`` was
        given or if no variant is left after filtering.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m',
        '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method')

    model.add_argument('-v',
                       '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')

    model.add_argument(
        '-t',
        '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help=
        'Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument('-p',
                       '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')

    model.add_argument('-l',
                       '--length',
                       choices=range(8, 18),
                       type=int,
                       default=9,
                       help='The length of peptides')

    model.add_argument(
        '-a',
        '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)')

    model.add_argument(
        '-r',
        '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.')

    model.add_argument(
        '-fINDEL',
        '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')

    model.add_argument('-fFS',
                       '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')

    model.add_argument('-fSNP',
                       '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')

    model.add_argument('-o',
                       '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')
    model.add_argument('-etk',
                       '--etk',
                       action="store_true",
                       help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                protein_ids = [l.strip() for l in f if l.strip() != ""]
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf,
                                                     gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Lists instead of filter(): the result is truth-tested and iterated
        # more than once below, which a lazy filter object would not survive.
        variants = [x for x in variants if x.type != VariationType.UNKNOWN]

        if args.filterSNP:
            variants = [x for x in variants if x.type != VariationType.SNP]

        if args.filterINDEL:
            indel_types = [
                VariationType.INS, VariationType.DEL, VariationType.FSDEL,
                VariationType.FSINS
            ]
            variants = [x for x in variants if x.type not in indel_types]

        if args.filterFSINDEL:
            frameshift_types = [VariationType.FSDEL, VariationType.FSINS]
            variants = [x for x in variants if x.type not in frameshift_types]

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        # keep only peptides that carry at least one variant
        epitopes = [
            pep for pep in generate_peptides_from_variants(
                variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL)
            if any(pep.get_variants_by_protein(tid) for tid in pep.proteins)
        ]

        for v in variants:
            for trans_id, coding in v.coding.items():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                gene = l.strip()
                # Blank lines are skipped, as in the vcf branch above.
                if gene == "":
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    gene, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(
                    ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_id = ensembl_ids[EAdapterFields.TRANSID]
                    transcript_to_genes[transcript_id] = gene
                    proteins.append(
                        Protein(protein_seq,
                                gene_id=gene,
                                transcript_id=transcript_id))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                          alleles=alleles)

    def gene_ids(peptide):
        """Return the set of gene IDs of all proteins the peptide stems from."""
        return set(
            transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
            for prot in peptide.get_all_proteins())

    with open(args.output, "w") as f:
        columns = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in columns) +
                "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(gene_ids(p))
            vars_str = ""

            if args.vcf is not None:
                # <transcript>:<variant>,<variant>|<transcript>:...
                vars_str = "\t" + "|".join(
                    set(
                        prot_id.split(":FRED2")[0] + ":" + ",".join(
                            repr(v)
                            for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins
                        if p.get_variants_by_protein(prot_id)))

            f.write(
                str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a]
                                                          for a in columns) +
                "\t" + proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            columns = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in columns) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(gene_ids(p))
                g.write(
                    str(p) + "\t" + "\t".join("%.3f" % row[a]
                                              for a in columns) + "\t" +
                    proteins + "\n")
    return 0
def __main__():
    """Report whether the requested tools support the alleles and lengths.

    The tools given with ``--tools`` are matched against the ":"-separated
    tool/version rows of the ``--versions`` file. For every requested allele
    and every peptide length (taken from the ``--peptides`` file, or from the
    ``--min_length``..``--max_length`` range), ``model_report.txt`` lists the
    tools that do not support it; entries without any supporting tool are
    also logged as warnings.

    Raises:
        ValueError: If a tool version is not among the versions Fred2 offers
            for that method, if none of the alleles is supported by any
            requested tool, or if none of the peptide lengths is.
    """
    # The text is the description; passed positionally it would become
    # argparse's `prog` and end up in the usage line.
    parser = argparse.ArgumentParser(
        description="Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default=1,
                        help="MHC class I or II")
    parser.add_argument('-l',
                        "--max_length",
                        help="Maximum peptide length",
                        type=int)
    parser.add_argument('-ml',
                        "--min_length",
                        help="Minimum peptide length",
                        type=int)
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True,
                        type=str)
    parser.add_argument('-t',
                        '--tools',
                        help='Tools requested for peptide predictions',
                        required=True,
                        type=str)
    parser.add_argument('-v',
                        '--versions',
                        help='<Required> File with used software versions.',
                        required=True)
    args = parser.parse_args()

    # Without a peptide file the length range is the only source of lengths;
    # fail with a usage error instead of a TypeError from range(None, ...).
    if not args.peptides and (args.min_length is None
                              or args.max_length is None):
        parser.error(
            "Either --peptides or both --min_length and --max_length have to be specified."
        )

    selected_methods = args.tools.split(',')
    tool_version = []
    with open(args.versions, 'r') as versions_file:
        for row in csv.reader(versions_file, delimiter=":"):
            # Blank lines and lines without a ":" carry no tool/version pair
            # and used to abort the run with an IndexError.
            if len(row) < 2 or not row[0].strip():
                continue
            tool_version.append((row[0].split()[0], str(row[1])))
    # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
    tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
    # get for each method the corresponding tool version
    methods = {
        method.strip(): version.strip()
        for tool, version in tool_version for method in selected_methods
        if tool.lower() in method.lower()
    }

    # get the alleles
    alleles = [Allele(a) for a in args.alleles.split(";")]

    if args.peptides:
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        supported_versions = EpitopePredictorFactory.available_methods()
        for method, version in methods.items():
            if version not in supported_versions[method.lower()]:
                raise ValueError("The specified version " + version + " for " +
                                 method + " is not supported by Fred2.")

        # Build every predictor once instead of once per allele/length.
        predictors = [(method, version,
                       EpitopePredictorFactory(method, version=version))
                      for method, version in methods.items()]

        # check if requested alleles are supported
        # The sorted lists are kept on purpose: membership stays an
        # equality scan exactly as before, independent of hashing.
        allele_support = [(method, version, sorted(predictor.supportedAlleles))
                          for method, version, predictor in predictors]
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version, supported_alleles in allele_support:
                if a not in supported_alleles:
                    output.write("Allele " + convert_allele_back(a) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write(
                "All selected alleles are supported by at least one of the requested tools.\n"
            )
        if no_allele_support:
            output.write(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )

        output.write("\n")
        # check if requested lengths are supported
        length_support = [(method, version, sorted(predictor.supportedLength))
                          for method, version, predictor in predictors]
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version, supported_lengths in length_support:
                if l not in supported_lengths:
                    output.write("Peptide length " + str(l) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write(
                "All selected or provided peptide lengths are supported by at least one of the requested tools.\n"
            )
        if no_length_support:
            output.write(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )
Ejemplo n.º 10
0
def __main__():
    """Run the EPAA command line workflow: predict epitopes and annotate them.

    Input is either a peptide file (``--peptides``) or a somatic variant file
    (``--somatic_mutations``, ``.GSvar``/``.tsv``/``.vcf``). Predictions are
    made with the tools from ``--tools`` in the versions listed in the
    ``--versions`` file, optionally annotated with wild type sequences,
    protein quantification, expression and ligandomics values, and written to
    ``<identifier>_prediction_results.tsv``. Summary statistics go to
    ``<identifier>_report.json``; with ``--fasta_output`` the protein
    sequences go to ``<identifier>_prediction_proteins.fasta``.

    Raises:
        ValueError: If the variant file has an unsupported extension or a
            tool version is not among the versions Fred2 offers.
    """
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")

    # Has to run before parse_args(): with the required options missing,
    # parse_args() exits on its own and the help would never be printed.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        somatic_mutations = args.somatic_mutations or ""
        if somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(somatic_mutations)
        elif somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(somatic_mutations)
        else:
            # Previously this fell through and failed later with an
            # unrelated error about an unassigned local variable.
            raise ValueError("Somatic variants have to be provided as .GSvar, .tsv or .vcf file.")

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        # row[1][1:] drops the first character of the version field;
        # rows without a second field are skipped instead of raising.
        tool_version = [(row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter="\t") if len(row) > 1]
    # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
    tool_version.append(('syfpeithi', '1.0'))
    # get for each selected method the corresponding tool version
    methods = {method: version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()}

    supported_versions = EpitopePredictorFactory.available_methods()
    for method, version in methods.items():
        if version not in supported_versions[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # The MHC class I and class II branches were identical, so the
    # `args.mhcclass is 1` identity test is dropped; only the input kind matters.
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except (ValueError, TypeError):
        # pd.concat rejects an empty or non-iterable input
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # NOTE(review): after the fallback above the frame has no 'method'
    # column, so this lookup raises a KeyError - confirm the intended handling.
    complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    # NOTE(review): all_peptides_filtered and proteins are only assigned for
    # variant input; --wild_type / --fasta_output together with --peptides
    # looks unsupported - confirm.
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns: fixed ones first, rest after
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                break  # one binding column is enough
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[next(iter(protein_quant))]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                # NOTE(review): this reads the second result row (iloc[1]),
                # which needs at least two rows - confirm iloc[0] was not meant.
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        # --wild_type is a store_true flag and therefore never None; the old
        # `!= None` test added the wild type columns unconditionally.
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    # fillna returns a new frame; the result used to be discarded
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE(review): despite the key names these two entries hold the
    # sequences themselves, not counts - confirm what report consumers expect.
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)

    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
Ejemplo n.º 11
0
def main():
    """Command-line entry point for neoepitope prediction.

    Peptides are generated either from an annotated variant file (``--vcf``,
    optionally restricted to the gene IDs listed in ``--proteins``) or, when
    no variant file is given, from the protein sequences of the HGNC IDs in
    ``--proteins``.  The selected prediction method is run for all alleles of
    ``--alleles`` and the scores are written as a tab-separated table to
    ``--output``.  With the hidden ``--etk`` switch a second table
    ``<output>_etk.tsv`` is written as well.

    Returns:
        0 on success; -1 if neither input file was given or if no variants
        are left after filtering.
    """
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m','--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
        )

    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
        )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
        )

    model.add_argument(
        '-p','--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
        )

    model.add_argument(
        '-l','--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
        )

    model.add_argument(
        '-a','--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
        )

    model.add_argument(
        '-r' ,'--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.'
        )

    model.add_argument(
        '-fINDEL' ,'--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
        )

    model.add_argument(
        '-fFS' ,'--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
        )

    model.add_argument(
        '-fSNP' ,'--filterSNP',
        action="store_true",
        help='Filter SNPs'
        )

    model.add_argument(
        '-o','--output',
        type=str,
        required=True,
        help='Path to the output file'
        )
    model.add_argument(
        '-etk','--etk',
        action="store_true",
        help=argparse.SUPPRESS
        )

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    # Maps transcript IDs to gene IDs; used to fill the "Antigen ID" column.
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                protein_ids = [line.strip() for line in f if line.strip()]

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            # NOTE(review): the gene filter is not applied for SnpEff input --
            # confirm whether that is intended.
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # List comprehensions (instead of filter()) keep `variants` a real
        # list, so the emptiness check below does not depend on filter()
        # returning a list.
        variants = [v for v in variants if v.type != VariationType.UNKNOWN]

        if args.filterSNP:
            variants = [v for v in variants if v.type != VariationType.SNP]

        if args.filterINDEL:
            indel_types = [VariationType.INS,
                           VariationType.DEL,
                           VariationType.FSDEL,
                           VariationType.FSINS]
            variants = [v for v in variants if v.type not in indel_types]

        if args.filterFSINDEL:
            frameshift_types = [VariationType.FSDEL, VariationType.FSINS]
            variants = [v for v in variants if v.type not in frameshift_types]

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        # Keep only peptides that carry at least one variant in any of their proteins.
        epitopes = [pep for pep in generate_peptides_from_variants(variants,
                                                                   int(args.length),
                                                                   martDB,
                                                                   EIdentifierTypes.ENSEMBL)
                    if any(pep.get_variants_by_protein(tid) for tid in pep.proteins.iterkeys())]

        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from the given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for line in f:
                gene_id = line.strip()
                # Skip blank lines, as the VCF branch above does, instead of
                # querying BioMart with an empty ID.
                if not gene_id:
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(gene_id, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_id = ensembl_ids[EAdapterFields.TRANSID]
                    transcript_to_genes[transcript_id] = gene_id
                    proteins.append(Protein(protein_seq, gene_id=gene_id, transcript_id=transcript_id))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    def gene_names(peptide):
        """Return the set of gene IDs of all proteins the peptide stems from."""
        return set(transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                   for prot in peptide.get_all_proteins())

    with open(args.output, "w") as f:
        alleles = result.columns
        # The separating tab belongs to the optional column so that the header
        # has no trailing empty column when no VCF file was given.
        var_column = "\t Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in alleles)+"\tAntigen ID"+var_column+"\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            genes = ",".join(gene_names(p))
            vars_str = ""

            if args.vcf is not None:
                # One "<protein id>:<variant>,<variant>..." entry per protein
                # that has variants, entries joined by "|".
                vars_str = "\t"+"|".join(set(prot_id.split(":FRED2")[0]+":"+",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                                             for prot_id in p.proteins.iterkeys()
                                             if p.get_variants_by_protein(prot_id)))

            f.write(str(p)+"\t"+method+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+genes+vars_str+"\n")

    if args.etk:
        with open(args.output.rsplit(".",1)[0]+"_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t"+"\t".join(a.name for a in alleles)+"\n")
            for index, row in result.iterrows():
                p = index[0]
                genes = " ".join(gene_names(p))
                g.write(str(p)+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+genes+"\n")
    return 0
Ejemplo n.º 12
0
def main():
    """Command-line entry point: run an epitope predictor on a FASTA or peptide file.

    Reads proteins (``--type fasta``) or peptides (``--type peptide``) from
    ``--input``, predicts scores for every allele listed in ``--alleles`` with
    the chosen method (and version, if one is given) and writes a
    tab-separated table to ``--output``.  For FASTA input an "Antigen ID"
    column with the transcript IDs of the source proteins is appended.

    Returns:
        0 on success, -1 if the input type is not recognised.
    """
    # Specify CTD interface
    # Every CTD Model has to have at least a name and a version, plus any of the optional attributes below them.
    parser = argparse.ArgumentParser(description='Process some integers.')

    parser.add_argument('-m', '--method', type=str,
                        choices=EpitopePredictorFactory.available_methods().keys(),
                        default="bimas",
                        help='The name of the prediction method')
    parser.add_argument('-v', '--version', type=str, default="",
                        help='The version of the prediction method')
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input file')
    parser.add_argument('-t', '--type', choices=["fasta","peptide"], type=str,
                        default="fasta",
                        help='The data type of the input (fasta, peptide list)')
    parser.add_argument('-l', '--length', choices=range(8, 18), type=int,
                        default=9,
                        help='The length of peptides')
    parser.add_argument('-a', '--alleles', type=str, required=True,
                        help='Path to the allele file (one per line in new nomenclature)')
    parser.add_argument('-op', '--options', type=str, default="",
                        help="Additional options that get directly past to the tool")
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to the output file')

    args = parser.parse_args()
    from_fasta = args.type == "fasta"

    if from_fasta:
        # A "|" in the first line selects position 1 as the ID position.
        with open(args.input, 'r') as handle:
            header_line = handle.readline()
        id_pos = 1 if "|" in header_line else 0
        source_proteins = read_fasta(args.input, in_type=Protein, id_position=id_pos)
        peptides = generate_peptides_from_proteins(source_proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)

    # An empty version string means: let the factory pick its default version.
    if args.version == "":
        predictor = EpitopePredictorFactory(args.method)
    else:
        predictor = EpitopePredictorFactory(args.method, version=args.version)
    result = predictor.predict(peptides, alleles, options=args.options)

    # write to TSV columns sequence method allele-scores...,protein-id/transcript-id
    with open(args.output, "w") as out:
        score_columns = result.columns
        antigen_header = "\tAntigen ID" if from_fasta else ""
        out.write("Sequence\tMethod\t"+"\t".join(col.name for col in score_columns)+antigen_header+"\n")
        for index, row in result.iterrows():
            peptide = index[0]
            method = index[1]
            if from_fasta:
                antigen_ids = "\t"+",".join(prot.transcript_id for prot in peptide.get_all_proteins())
            else:
                antigen_ids = ""
            scores = "\t".join("%.3f"%row[col] for col in score_columns)
            out.write(str(peptide)+"\t"+method+"\t"+scores+antigen_ids+"\n")

    return 0
Ejemplo n.º 13
0
 def test_single_peptide_input_mhcII(self):
     """Predict one MHC-II peptide with one allele for each eligible method.

     A method is exercised when its predictor is not an
     AExternalEpitopePrediction and lists every allele of ``self.mhcII``
     among its supported alleles.
     """
     for method_name in EpitopePredictorFactory.available_methods():
         predictor = EpitopePredictorFactory(method_name)
         if isinstance(predictor, AExternalEpitopePrediction):
             continue
         if not all(a.name in predictor.supportedAlleles for a in self.mhcII):
             continue
         # Single peptide and single allele passed directly, not wrapped in lists.
         predictor.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])
Ejemplo n.º 14
0
 def test_epitope_prediction_available_methods(self):
     """Print the methods reported by EpitopePredictorFactory.available_methods()."""
     available = EpitopePredictorFactory.available_methods()
     print(available)
Ejemplo n.º 15
0
 def test_epitope_prediction_available_methods(self):
     """Query the factory for its available prediction methods and print them."""
     methods = EpitopePredictorFactory.available_methods()
     print(methods)
# Raise pandas' display limits (rows, columns, line width).
# 'display.height' is deliberately no longer set: it was a deprecated option
# that newer pandas releases removed (setting it raises an error there), and
# 'display.max_rows' below already controls the number of rows shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)


# Command-line interface: --predictor and --dataset are mandatory and listed
# in their own "required arguments" help group.
parser = argparse.ArgumentParser(description='Call epitope predictors on data.')
requiredNamed = parser.add_argument_group('required arguments')
requiredNamed.add_argument('--predictor', type=str, help='Epitope predictors [see all with --predictor=list]', required=True)
requiredNamed.add_argument('--dataset', type=str, help='Immunogenic dataset [see all with --dataset=list]', required=True)
parser.add_argument('-n', type=int, help='Number of rows to take from dataset')
# NOTE(review): the default is a list of three alleles, whereas a value given
# on the command line is a single string (type=str) -- confirm that the code
# consuming args.allele handles both.
parser.add_argument('--allele', type=str, help='HLA Type', default=["HLA-A*01:01","HLA-A*02:01","HLA-B*15:01"])

args = parser.parse_args()

# Names of all methods known to the factory; the version values are ignored.
all_predictors = [ name for name,version in EpitopePredictorFactory.available_methods().iteritems()]

# Exclude these two methods from the selectable predictors.
# list.remove raises ValueError if a name is not in the list.
all_predictors.remove("netmhcstabpan")
all_predictors.remove("netmhc")

if args.predictor == 'list':
	print("Set one of the predictors with --predictor:")
	print(all_predictors)
	print ("""
Details from https://bioinformatics.oxfordjournals.org/content/suppl/2016/02/26/btw113.DC1/S1.pdf
 SYFPEITHI     T-cell epitope  (Rammensee, et al., 1999)
 BIMAS         MHC-I binding   (Parker, et al., 1994)
 SVMHC         MHC-I binding   (Dönnes and Elofsson, 2002)
 ARB           MHC-I binding   (Bui, et al., 2005)
 SMM           MHC-I binding   (Peters and Sette, 2005)
 SMMPMBEC      MHC-I binding   (Kim, et al., 2009)
Ejemplo n.º 17
0
# Register the mandatory --dataset option on the `requiredNamed` argument group.
requiredNamed.add_argument(
    '--dataset',
    type=str,
    help='Immunogenic dataset [see all with --dataset=list]',
    required=True)
# Optional: number of rows to take from the dataset.
parser.add_argument('-n', type=int, help='Number of rows to take from dataset')
# NOTE(review): the default is a list of three alleles, whereas a value given
# on the command line is a single string (type=str) -- confirm that the code
# consuming args.allele handles both.
parser.add_argument('--allele',
                    type=str,
                    help='HLA Type',
                    default=["HLA-A*01:01", "HLA-A*02:01", "HLA-B*15:01"])

args = parser.parse_args()

# Names of all methods known to the factory; the version values are ignored.
all_predictors = [
    name for name, version in
    EpitopePredictorFactory.available_methods().iteritems()
]

# Exclude these two methods from the selectable predictors.
# list.remove raises ValueError if a name is not in the list.
all_predictors.remove("netmhcstabpan")
all_predictors.remove("netmhc")

if args.predictor == 'list':
    print("Set one of the predictors with --predictor:")
    print(all_predictors)
    print("""
Details from https://bioinformatics.oxfordjournals.org/content/suppl/2016/02/26/btw113.DC1/S1.pdf
 SYFPEITHI     T-cell epitope  (Rammensee, et al., 1999)
 BIMAS         MHC-I binding   (Parker, et al., 1994)
 SVMHC         MHC-I binding   (Dönnes and Elofsson, 2002)
 ARB           MHC-I binding   (Bui, et al., 2005)
 SMM           MHC-I binding   (Peters and Sette, 2005)