Ejemplo n.º 1
0
 def test_read_lines(self):
     alleles = FileReader.read_lines(self.ale_path, in_type=Allele)
     self.assertEqual(len(alleles), 2)
     self.assertRaises(IOError,
                       FileReader.read_lines,
                       self.ale_no_path,
                       in_type=Allele)
     self.assertRaises(ValueError,
                       FileReader.read_lines,
                       self.ale_zonk_path,
                       in_type=Allele)
Ejemplo n.º 2
0
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)"
    )
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument("-k","--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al","--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be","--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")

    parser.add_argument("-cp","--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument("-ep","--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument("-thr","--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")


    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name:args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles,
                                       k=args.max_length,en=9,threshold=thr,
                                       solver="cplex", alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,options={"preprocessing_presolve":"n","threads":1})

    print
    print "Resulting String-of-Beads: ","-".join(map(str,svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str,svbws)))
Ejemplo n.º 3
0
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Ejemplo n.º 4
0
def __main__():
    parser = argparse.ArgumentParser(
        description=
        """EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default="I",
                        help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True)
    parser.add_argument(
        '-r',
        "--reference",
        help=
        "Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f',
                        "--filter_self",
                        help="Filter peptides against human proteom",
                        required=False,
                        action='store_true')
    parser.add_argument(
        '-wt',
        "--wild_type",
        help="Add wild type sequences of mutated peptides to output",
        required=False,
        action='store_true')
    parser.add_argument('-rp',
                        "--reference_proteome",
                        help="Reference proteome for self-filtering",
                        required=False)
    parser.add_argument('-gr',
                        "--gene_reference",
                        help="List of gene IDs for ID mapping.",
                        required=False)
    parser.add_argument('-pq',
                        "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge',
                        "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument(
        '-de',
        "--diff_gene_expression",
        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument(
        '-li',
        "--ligandomics_id",
        help=
        "Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run."
    )
    parser.add_argument('-o',
                        "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            logging.basicConfig(filename=os.path.join(
                args.output_dir, '{}_prediction.log'.format(args.identifier)),
                                filemode='w+',
                                level=logging.DEBUG)
            logging.info("Using provided data directory: {}".format(
                str(args.output_dir)))
        except:
            logging.info("No such directory, using current.")
    else:
        logging.basicConfig(filename='{}_prediction.log'.format(
            args.identifier),
                            filemode='w+',
                            level=logging.DEBUG)
        logging.info("Using current data directory.")

    logging.info("Starting predictions at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }
    global transcriptProteinMap
    global transcriptSwissProtMap
    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(
                '.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference],
            args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(
                        os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions
    if args.mhcclass == "I":
        #methods = ['netmhc-4.0', 'syfpeithi-1.0', 'netmhcpan-3.0']
        methods = ['syfpeithi-1.0']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 8,
                int(args.length) + 1, ma, up_db, args.identifier, metadata,
                transcriptProteinMap)
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 15,
                int(args.length) + 1, ma, up_db, args.identifier, metadata,
                transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")

    # store version of used methods
    method_map = {}
    for m in methods:
        method_map[m.split('-')[0]] = m

    # replace method names with method names with version
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = [
            'sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene',
            'transcripts', 'proteins', 'variant type', 'method'
        ]
    # Change the order (the index) of the columns
    else:
        columns_tiles = [
            'sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
            'proteins', 'variant type', 'method'
        ]
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(
                k)] = complete_df.apply(
                    lambda row: create_quant_column_value_for_result(
                        row, protein_quant, transcriptSwissProtMap, k),
                    axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 1, False),
            axis=1)

        if args.wild_type != None:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 1, True),
                axis=1)

    # write dataframe to tsv
    complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       '\t',
                       index=False)

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(
        set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
Ejemplo n.º 5
0
def __main__():
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"): 
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [ (row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter = "\t") ]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = { method:version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower() }

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I or II predictions
    if args.mhcclass is 1:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
    else:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # complete_df.replace({'method': methods}, inplace=True)
    complete_df['method'] = complete_df['method'].apply(lambda x : x + '-' + methods[x] )

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))
    
    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
        
    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        if args.wild_type != None:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
    
    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
Ejemplo n.º 6
0
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default=1,
                        help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True)
    parser.add_argument('-t',
                        '--tools',
                        help='Tools requested for peptide predictions',
                        required=True,
                        type=str)
    parser.add_argument('-v',
                        '--versions',
                        help='<Required> File with used software versions.',
                        required=True)
    args = parser.parse_args()

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {
            method: version
            for tool, version in tool_version for method in selected_methods
            if tool.lower() in method.lower()
        }

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    peptide_lengths = []
    if (args.peptides):
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods(
            )[method]:
                raise ValueError("The specified version " + version + " for " +
                                 method + " is not supported by Fred2.")

        # check if reuested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write(
                "All selected alleles are supported by at least one of the requested tools.\n"
            )
        if no_allele_support:
            output.write(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )

        output.write("\n")
        # check if reuested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write(
                "All selected or provided peptide lengths are supported by at least one of the requested tools.\n"
            )
        if no_length_support:
            output.write(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )
Ejemplo n.º 7
0
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values(
        )[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
Ejemplo n.º 8
0
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)"
                        )
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
                        )

    #parameters of the model
    parser.add_argument("-k","--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al","--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be","--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument("-thr","--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")


    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
    					type=int,
    					default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")


    args = parser.parse_args()


  	#parse input
    peptides = FileReader.read_lines(args.input, type="Peptide")
    #read in alleles
    alleles = generate_alleles(args.alleles)


    #set-up model
    cl_pred = CleavageSitePredictorFactory("PCM")
    epi_pred = EpitopePredictorFactory("Syfpeithi")

    thr = {a.name:args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles,
                                      k=args.max_length,en=9,threshold=thr,
                                      solver="cplex", alpha=args.alpha, beta=args.beta,
                                      verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,options="preprocessing_presolve=n,threads=1")

    print
    print "Resulting String-of-Beads: ","-".join(map(str,svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str,svbws)))
Ejemplo n.º 9
0
def main():
    parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+
                                                 "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             +" or peptide sequences as sequences (max 50)"
                        )
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r","--refseq",
                             action="store_true",
                             help= "Specifies the input as RefSeq IDs")
    input_types.add_argument("-u","--uniprot",
                             action="store_true",
                             help= "Specifies the input as UniProt IDs")
    input_types.add_argument("-f","--fasta",
                             action="store_true",
                             help= "Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf","--pepfile",
                             action="store_true",
                             help= "Specifies the input as peptide file")
    input_types.add_argument("-p","--peptide",
                             action="store_true",
                             help= "Specifies the input as peptide sequences")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. " +
                             "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                               action="store_true",
                               help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                               action="store_true",
                               help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                       required=True,
                       nargs="+",
                       help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    #COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        for pred, obj in AEpitopePrediction.registry.iteritems():
            if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]:
                print "Method: ",pred
                print "Supported Alleles: ", " ".join(getattr(obj, "_"+pred+"__alleles" ))
                print "Supported Length: ", " ".join(map(str, getattr(obj,  "_"+pred+"__supported_length")))
                print
        sys.exit(0)


    '''
    Parser Input
    '''
    #RefSeq
    if args.refseq:
        pass

    #UniProt
    elif args.uniprot:
        pass

    #fasta protein
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, type="Protein")
        peptides = generate_peptides_from_protein(proteins, args.length)

    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, type="Peptide")

    elif args.peptide:
        peptides = [Peptide(s) for s in args.input]

    #read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, type="Allele")
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output)
    with open(output, "w") as out:
        r_df.to_csv(out)



    #generate Galaxy HTML output
    if args.html:
        begin_html = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" />
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script>
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script>
    <script type="text/javascript" src="/static/scripts/libs/etk.js"></script>
</head>

<body>
    <div class="document">"""

        setting = """  <h2 class="etk-heading">Epitope Prediction Results</h2>

        <table class="etk-parameterT">
            <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr>
            <tr>
                <th>Prediction Method:</th>
                <td>%s</td>
            </tr>
        </table>"""%args.method



        table="""

        <input id="etk-search" placeholder="  filter">
        <table class="etk-sortT etk-resultsT etk-filterT">

            <thead>
                <tr>
                    <th>Peptide</th>"""+"".join("<th>%s</th>"%str(a) for a in result.columns) \
            +"""
                </tr>
            </thead>"""+"".join("<tr><td>%s<td>%s</tr>"%(r[0] ,"".join("<td align='right'>%s</td>"%str(result.loc[r, c])))
                                for r in result.index for c in result.columns)+"</table>"

        end_html = "</div></body></html>"

        html_out = ".".join(output.split(".")[:-1])+".html"
        with open(html_out, "w") as html_o:
            html_o.write(begin_html+setting+table+end_html)
Ejemplo n.º 10
0
def main():
    parser = argparse.ArgumentParser(
        description=
        """The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a",
        "--alleles",
        required=True,
        help=
        "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument(
        "-k",
        "--max_length",
        default=6,
        type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al",
        "--alpha",
        default=0.99,
        type=float,
        help=
        "Specifies the first-order preference of the user in the model [0,1] (default 0.99)"
    )
    parser.add_argument(
        "-be",
        "--beta",
        default=0.0,
        type=float,
        help=
        "Specifies the second-order preference of the user in the model [0,1] (default 0)."
    )

    parser.add_argument(
        "-cp",
        "--cleavage_prediction",
        default="PCM",
        help=
        "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument(
        "-ep",
        "--epitope_prediction",
        default="Syfpeithi",
        help=
        "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument(
        "-thr",
        "--threshold",
        default=20,
        type=float,
        help=
        "Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=None,
        help=
        "Specifies number of threads. If not specified all available logical cpus are used."
    )

    parser.add_argument(
        "--ips-solver",
        default="cplex",
        choices=["cplex", "cbc"],
        help=
        "Executable name of the IPS solver. Executable needs to be available in PATH."
    )

    parser.add_argument("--tsp-solution",
                        default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")

    parser.add_argument(
        "--random-order",
        action="store_true",
        help=
        "Indicate whether to generate a random ordered string-of-beads polypeptide"
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=1,
        help="Seed for random ordering of string-of-beads polypeptide")

    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in [
            "PCM", "PROTEASMM_C", "PROTEASMM_S"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in [
            "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver=args.ips_solver,
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads

    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads,
                                   options={
                                       "preprocessing_presolve": "n",
                                       "threads": 1
                                   })
    else:
        svbws = solver.solve(threads=threads,
                             options={
                                 "preprocessing_presolve": "n",
                                 "threads": 1
                             })

    # Generate random ordered string-of-breads, but still uses optimal spacers
    # determined from the above solve function.
    if args.random_order:
        print "Generating a randomly ordered polypeptide"
        random.seed(args.seed)
        random_order_sob = []
        random.shuffle(peptides)
        for i in range(len(peptides)):

            # Break from loop once we hit the last peptide
            if i == len(peptides) - 1:
                random_order_sob.extend([Peptide(str(peptides[i]))])
                break

            left_peptide = str(peptides[i])
            right_peptide = str(peptides[i + 1])
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]

            # Right peptide gets added in the next iteration
            random_order_sob.extend(
                [Peptide(left_peptide),
                 Peptide(opt_spacer)])

        svbws = random_order_sob

    print
    print "Resulting String-of-Beads: ", "-".join(map(str, svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str, svbws)))
Ejemplo n.º 11
0
 def test_read_lines(self):
     alleles = FileReader.read_lines(self.ale_path, in_type=Allele)
     self.assertEqual(len(alleles), 2)
     self.assertRaises(IOError, FileReader.read_lines, self.ale_no_path, in_type=Allele)
     self.assertRaises(ValueError, FileReader.read_lines, self.ale_zonk_path, in_type=Allele)