def __main__():
    """Command-line entry point of EPAA (Epitope Prediction And Annotation).

    Parses the command line, reads either a peptide list or somatic variants,
    runs the selected prediction tools for the given alleles and optionally
    annotates the predictions with wild type sequences, protein
    quantification, (differential) expression and ligandomics values.

    Files written (all prefixed with the ``--identifier`` value):
    ``<id>_prediction.log``, ``<id>_prediction_results.tsv``,
    ``<id>_report.json`` and, with ``--fasta_output`` on variant input,
    ``<id>_prediction_proteins.fasta``.

    Raises:
        ValueError: if a version listed in the versions file is not among the
            versions ``EpitopePredictorFactory.available_methods()`` reports
            for the selected method.
    """
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""")
    # Explicit version action instead of the deprecated ``version=`` constructor
    # argument; it provides the same -v/--version flags.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")

    # Print the help when called without arguments. This has to happen before
    # parse_args(), which aborts on the missing required options.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Fail early with a usage error instead of a TypeError/NameError deep
    # inside the pipeline when companion options are missing.
    if not args.peptides:
        if not args.somatic_mutations:
            parser.error("Either --peptides or --somatic_mutations has to be provided.")
        if not args.somatic_mutations.endswith(('.GSvar', '.tsv', '.vcf')):
            parser.error("Unsupported variant file '{}', expected .GSvar, .tsv or .vcf.".format(args.somatic_mutations))
        if args.min_length is None or args.max_length is None:
            parser.error("--min_length and --max_length are required when predicting from variants.")
    if args.filter_self and not args.reference_proteome:
        parser.error("--reference_proteome is required together with --filter_self.")
    if args.gene_expression is not None and args.gene_reference is None:
        parser.error("--gene_reference is required together with --gene_expression.")

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        else:
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        # The first character of the version column is dropped
        # (presumably a leading "v" -- TODO confirm against the versions file).
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")
                        if len(row) >= 2]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = {method: version
                   for tool, version in tool_version
                   for method in selected_methods
                   if tool.lower() in method.lower()}

    available_methods = EpitopePredictorFactory.available_methods()
    for method, version in methods.items():
        if version not in available_methods[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # The original code branched on the MHC class here, but both branches ran
    # the very same calls, so a single code path is sufficient.
    all_peptides_filtered = None
    proteins = None
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except (ValueError, TypeError):
        # pandas refuses to concatenate when there is nothing to concatenate
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")
    has_predictions = not complete_df.empty

    # replace method names with method names with version
    if 'method' in complete_df.columns:
        complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    include_wild_type = False
    if args.wild_type:
        if all_peptides_filtered is None:
            logger.warning("Wild type sequences are only available for variant input, skipping.")
        elif has_predictions:
            include_wild_type = True

    if include_wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']

    # Change the order (the index) of the columns: fixed columns first,
    # everything else afterwards in its existing order.
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []
    seen_binders = set()  # constant-time membership test for the loop below

    for _, r in complete_df.iterrows():
        sequence = str(r['sequence'])
        # identity test kept from the original: only a real ``True`` counts
        if any(r[c] is True for c in binder_cols):
            binders.append(sequence)
            seen_binders.add(sequence)
            pos_predictions.append(sequence)
        else:
            neg_predictions.append(sequence)
            if sequence not in seen_binders:
                non_binders.append(sequence)

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None and has_predictions:
        protein_quant = read_protein_quant(args.protein_quantification)
        # any entry will do, it is only used to obtain the per-entry keys
        first_entry = protein_quant[next(iter(protein_quant))]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None and has_predictions:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # Decide once which column of the gene reference holds the IDs used
        # in the results. The first row is used so that a single-row result
        # works as well.
        uses_ensembl_ids = 'ENSG' in str(complete_df.iloc[0]['gene'])
        with open(args.gene_reference, 'r') as gene_list:
            for line in gene_list:
                ids = line.split('\t')
                gene_key = ids[0] if uses_ensembl_ids else ids[1]
                gene_id_lengths[gene_key] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None and has_predictions:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None and has_predictions:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        # --wild_type is a store_true flag and therefore never None; the wild
        # type columns are only meaningful when wild type sequences were added.
        if include_wild_type:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        if proteins is None:
            logger.warning("Protein FASTA output is only available for variant input, skipping.")
        else:
            with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
                for p in proteins:
                    variants = []
                    for v in p.vars:
                        variants.extend(p.vars[v])
                    coding = set(itertools.chain.from_iterable(x.coding.values() for x in variants))
                    cds = ','.join([y.cdsMutationSyntax for y in coding])
                    aas = ','.join([y.aaMutationSyntax for y in coding])
                    protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                    protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv; missing values are written as empty fields
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', index=False, na_rep='')

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE(review): despite their names the two entries below hold the unique
    # sequences themselves, not counts. Kept as is because consumers of the
    # report may rely on it.
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)

    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def __main__():
    """Command-line entry point of EPAA (Epitope Prediction And Annotation).

    Parses the command line, reads either a peptide list or somatic variants,
    runs a fixed set of prediction methods chosen by the MHC class and
    optionally annotates the predictions with wild type sequences, protein
    quantification, (differential) expression and ligandomics values.

    When ``--output_dir`` is given and can be entered, it becomes the working
    directory, so all files end up there. Files written (prefixed with the
    ``--identifier`` value): ``<id>_prediction.log``,
    ``<id>_prediction_results.tsv`` and ``<id>_report.json``.
    """
    parser = argparse.ArgumentParser(
        description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""")
    # Explicit version action instead of the deprecated ``version=`` constructor
    # argument; it provides the same -v/--version flags.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default="I", help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    parser.add_argument('-o', "--output_dir", help="All files written will be put in this directory")

    # Print the help when called without arguments. This has to happen before
    # parse_args(), which aborts on the missing required options.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Fail early with a usage error instead of a TypeError/NameError deep
    # inside the pipeline when companion options are missing.
    if not args.peptides:
        if not args.somatic_mutations:
            parser.error("Either --peptides or --somatic_mutations has to be provided.")
        if not args.somatic_mutations.endswith(('.GSvar', '.tsv', '.vcf')):
            parser.error("Unsupported variant file '{}', expected .GSvar, .tsv or .vcf.".format(args.somatic_mutations))
        if args.length is None:
            parser.error("--length is required when predicting from variants.")
    if args.filter_self and not args.reference_proteome:
        parser.error("--reference_proteome is required together with --filter_self.")
    if args.gene_expression is not None and args.gene_reference is None:
        parser.error("--gene_reference is required together with --gene_expression.")

    # Enter the output directory first. Afterwards the log file is addressed
    # relative to the (new) working directory: joining it with output_dir
    # again would point to a non-existing nested path for relative
    # directories.
    entered_output_dir = False
    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            entered_output_dir = True
        except OSError:
            # best effort: keep working in the current directory
            entered_output_dir = False

    logging.basicConfig(filename='{}_prediction.log'.format(args.identifier), filemode='w+', level=logging.DEBUG)
    if args.output_dir is None:
        logging.info("Using current data directory.")
    elif entered_output_dir:
        logging.info("Using provided data directory: {}".format(str(args.output_dir)))
    else:
        logging.info("No such directory, using current.")

    logging.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        else:
            # unlike read_GSvar, read_vcf is unpacked into two values here,
            # so metadata stays empty for VCF input
            vl, transcripts = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions: the class selects the prediction methods
    # and the minimum peptide length used for variant input
    if args.mhcclass == "I":
        #methods = ['netmhc-4.0', 'syfpeithi-1.0', 'netmhcpan-3.0']
        methods = ['syfpeithi-1.0']
        min_length = 8
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        min_length = 15

    all_peptides_filtered = None
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(
            peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
            vl, methods, alleles, min_length, int(args.length) + 1, ma, up_db,
            args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except (ValueError, TypeError):
        # pandas refuses to concatenate when there is nothing to concatenate
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")
    has_predictions = not complete_df.empty

    # store version of used methods: maps "name" -> "name-version"
    method_map = {m.split('-')[0]: m for m in methods}

    # replace method names with method names with version
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    include_wild_type = False
    if args.wild_type:
        if all_peptides_filtered is None:
            logging.warning("Wild type sequences are only available for variant input, skipping.")
        elif has_predictions:
            include_wild_type = True

    if include_wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']

    # Change the order (the index) of the columns: fixed columns first,
    # everything else afterwards in its existing order.
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []
    seen_binders = set()  # constant-time membership test for the loop below

    for _, r in complete_df.iterrows():
        sequence = str(r['sequence'])
        # identity test kept from the original: only a real ``True`` counts
        if any(r[c] is True for c in binder_cols):
            binders.append(sequence)
            seen_binders.add(sequence)
            pos_predictions.append(sequence)
        else:
            neg_predictions.append(sequence)
            if sequence not in seen_binders:
                non_binders.append(sequence)

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None and has_predictions:
        protein_quant = read_protein_quant(args.protein_quantification)
        # any entry will do, it is only used to obtain the per-entry keys
        first_entry = protein_quant[next(iter(protein_quant))]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                lambda row: create_quant_column_value_for_result(
                    row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None and has_predictions:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # Decide once which column of the gene reference holds the IDs used
        # in the results. The first row is used so that a single-row result
        # works as well.
        uses_ensembl_ids = 'ENSG' in str(complete_df.iloc[0]['gene'])
        with open(args.gene_reference, 'r') as gene_list:
            for line in gene_list:
                ids = line.split('\t')
                gene_key = ids[0] if uses_ensembl_ids else ids[1]
                gene_id_lengths[gene_key] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None and has_predictions:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None and has_predictions:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 1, False), axis=1)

        # --wild_type is a store_true flag and therefore never None; the wild
        # type columns are only meaningful when wild type sequences were added.
        if include_wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 1, True), axis=1)

    # write dataframe to tsv; missing values are written as empty fields
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', index=False, na_rep='')

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE(review): despite their names the two entries below hold the unique
    # sequences themselves, not counts. Kept as is because consumers of the
    # report may rely on it.
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
def main():
    """Parse the command line of the epitope prediction script.

    With ``--available`` every registered prediction method (except the
    abstract base classes listed below) is printed together with its supported
    alleles and peptide lengths, and the program exits with status 0.
    Otherwise, when ``--protein`` is set, transcript information is requested
    from the BioMart adapter for every ID given via ``--input``.

    NOTE(review): the looked-up information is not used yet and the other
    input types are not handled -- this function looks unfinished.
    """
    parser = argparse.ArgumentParser(
        description="Reads protein or peptide sequences and predicts peptides "
                    + "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             + " or peptide sequences as sequences (max 50)")
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-p", "--protein",
                             action="store_true",
                             help="Specifies if IDs are protein IDs")
    input_types.add_argument("-r", "--rna",
                             action="store_true",
                             help="Specifies the input as rna IDs")
    input_types.add_argument("-f", "--fasta",
                             action="store_true",
                             help="Specifies the input as protein (multi-)Fasta file")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. "
                             + "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                              action="store_true",
                              help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                              action="store_true",
                              help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                        required=True,
                        nargs="+",
                        help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    # COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        # abstract base classes that are registered but cannot predict anything
        abstract_predictors = ("AEpitopePrediction", "APSSMEpitopePredictor",
                               "ANetMHC", "ASVMEpitopePrediction")
        for pred, obj in AEpitopePrediction.registry.items():
            if pred in abstract_predictors:
                continue
            # Single-argument print calls behave the same as a statement and
            # as a function; the double space reproduces the original output.
            print("Method:  {}".format(pred))
            print("Supported Alleles:  {}".format(
                " ".join(getattr(obj, "_" + pred + "__alleles"))))
            print("Supported Length:  {}".format(
                " ".join(map(str, getattr(obj, "_" + pred + "__supported_length")))))
            print("")
        sys.exit(0)

    mart = MartsAdapter()

    # RefSeq
    transcripts = []  # NOTE(review): never filled so far
    if args.protein:
        for pid in args.input:
            ids = mart.get_transcript_information_from_protein_id(pid)
    # UniProt
    # No option defines ``uniprot``, so a plain attribute access would raise
    # AttributeError for every run without --protein.
    elif getattr(args, "uniprot", False):
        pass
def __main__():
    """Create a FASTA file with proteins or peptides carrying the given variants.

    Reads variants from a VCF file, generates the affected proteins (stop
    gains are dropped) and writes, depending on the options, one of:

    1. the varied proteins (``--Proteins`` without prediction),
    2. the varied peptides of length ``--digest`` (no prediction),
    3. peptides/proteins annotated with prediction results (``--predict``),
    4. the same restricted to rows with a score above ``--filter``.

    Every path ends the program via ``sys.exit``: status 0 after writing
    (or after printing "N/A" for the not yet implemented ``--base`` mode),
    status 1 on any input or prediction error.
    """
    parser = argparse.ArgumentParser()
    # Explicit version action instead of the deprecated ``version=`` constructor
    # argument; it provides the same -v/--version flags.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    # Print the help when called without arguments. This has to happen before
    # parse_args(), which aborts on the missing required options.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    # ``is not None`` so that a threshold of 0.0 still counts as a filter
    has_filter = options.filter is not None

    if has_filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter option, aborting!")
        sys.exit(1)
    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0]
        + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    def write_and_exit(fasta_text):
        # Write the finished FASTA text to the output file and end the program.
        with open(options.outfile_path, 'w') as f:
            f.write(fasta_text)
        sys.exit(0)

    def base_fasta_not_available():
        # Replacing entries of a base FASTA is not implemented yet.
        print("N/A")
        sys.exit(0)

    # TODO: look at theos filter, ligandoqc, fasta-distributions, lica and the
    #       morgenstellen server conten scripts. Open questions:
    #       complete proteins? only containing binders? k-mers? binders only?
    #       FastaSlicer.py? remove original if homozygous (needs fasta input)?
    #       add germline variant option? or expect all to be in one vcf?

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        # Reading GSvar files is not implemented (FileReader.read_GSvar is not
        # wired in); stop here instead of failing later on undefined variants.
        m = "GSvar input {f} is not supported yet, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions
    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = list(p.vars.values())[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option: plain proteins, no prediction involved
    if not (options.predict_with or has_filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            base_fasta_not_available()
        write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest if options.digest else 9
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = list(peptide_gen)
    peptides_var = [
        x for x in peptides
        if any(x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option: plain peptides, no prediction involved
    if not (options.predict_with or has_filter):
        write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print("{} what: {}".format(m, str(e)))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # report the variant file that was processed (the options object has
        # no ``inf`` attribute)
        print("something went wrong with the prediction {} {} what: {}".format(
            options.var_file, options.predict_with, str(e)))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    for i, row in preds.iterrows():
        # i is the row index tuple: its first element gets log_metadata()
        # called once for every remaining element
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option: annotated but unfiltered output
    if not has_filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                base_fasta_not_available()
            write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(preds)))
        write_and_exit(peptides_to_fasta(preds))

    # kick out nonbinder: keep rows with at least one value above the threshold
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option: annotated and filtered output
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            base_fasta_not_available()
        write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(preds_f)))
    write_and_exit(peptides_to_fasta(preds_f))
def test_MartsAdapter(self):
    # Exercises the MartsAdapter look-ups against the GRCh37 Ensembl BioMart
    # URL: one transcript/product that resolves and one that must yield None
    # for each query type, followed by the ID mapping for the TP53 gene name.
    ensembl = EIdentifierTypes.ENSEMBL
    known_transcript = 'ENST00000361221'
    unknown_transcript = "ENST00000614237"
    ma = MartsAdapter(biomart="http://grch37.ensembl.org")

    # transcript positions
    position = ma.get_transcript_position(known_transcript, '17953929', '17953943', type=ensembl)
    self.assertEqual((1515, 1529), position)
    missing_position = ma.get_transcript_position(unknown_transcript, 7566927, 7566927, type=ensembl)
    self.assertIsNone(missing_position)
    #logging.captureWarnings(True)
    #result = ma.get_transcript_position("ENST00000614237", 7566927, 7566927, type=EIdentifierTypes.ENSEMBL)

    # gene by genomic position
    gene = ma.get_gene_by_position(17, 7566927, 7566927)
    self.assertEqual("TP53", gene)

    # product (protein) sequences
    uniprot_sequence = ma.get_product_sequence("Q15942", type=EIdentifierTypes.UNIPROT)
    self.assertIsNone(uniprot_sequence)
    refseq_sequence = ma.get_product_sequence("NP_001005353", type=EIdentifierTypes.REFSEQ)
    self.assertEqual(self.NP_001005353, refseq_sequence)
    ensembl_sequence = ma.get_product_sequence("ENSP00000369497", type=ensembl)
    self.assertEqual(self.ENSP00000369497, ensembl_sequence)

    # transcript sequences
    transcript_sequence = ma.get_transcript_sequence(known_transcript, type=ensembl)
    self.assertEqual(self.ENST00000361221[2], transcript_sequence)
    missing_sequence = ma.get_transcript_sequence(unknown_transcript, type=ensembl)
    self.assertIsNone(missing_sequence)

    # transcript information
    transcript_info = ma.get_transcript_information(known_transcript, type=ensembl)
    self.assertDictEqual(self.ENST00000361221, transcript_info)
    missing_info = ma.get_transcript_information(unknown_transcript, type=ensembl)
    self.assertIsNone(missing_info)

    # Ensembl IDs for a gene name, compared via their string representation
    expected_tp53_ids = (
        "[{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000413465', 4: 'ENSP00000410739'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000359597', 4: 'ENSP00000352610'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504290', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000510385', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504937', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000269305', 4: 'ENSP00000269305'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000455263', 4: 'ENSP00000398846'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000420246', 4: 'ENSP00000391127'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000445888', 4: 'ENSP00000391478'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000576024', 4: 'ENSP00000458393'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000509690', 4: 'ENSP00000425104'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000514944', 4: 'ENSP00000423862'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000574684', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000505014', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000508793', 4: 'ENSP00000424104'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000604348', 4: 'ENSP00000473895'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000503591', 4: 'ENSP00000426252'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t8', 4: 'LRG_321p8'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t7', 4: 'LRG_321p13'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t6', 4: 'LRG_321p12'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t5', 4: 'LRG_321p11'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t4', 4: 'LRG_321p10'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t3', 4: 'LRG_321p3'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t2', 4: 'LRG_321p2'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t1', 4: 'LRG_321p1'}]"
    )
    tp53_ids = ma.get_ensembl_ids_from_id('TP53', type=EIdentifierTypes.GENENAME)
    self.assertEqual(str(tp53_ids), expected_tp53_ids)
def test_MartsAdapter(self):
    """Check MartsAdapter query results against the GRCh37 BioMart endpoint.

    Lookups using transcript id ENST00000614237 or UniProt id Q15942 are
    expected to return None; the remaining lookups are compared with
    fixtures stored on the test case or with literal values.
    """
    marts = MartsAdapter(biomart="http://grch37.ensembl.org")
    id_type = EIdentifierTypes.ENSEMBL
    hit_id = 'ENST00000361221'
    miss_id = "ENST00000614237"

    # get_transcript_position
    located = marts.get_transcript_position(
        hit_id, '17953929', '17953943', type=id_type)
    self.assertEqual((1515, 1529), located)
    located = marts.get_transcript_position(
        miss_id, 7566927, 7566927, type=id_type)
    self.assertIsNone(located)

    # get_gene_by_position
    symbol = marts.get_gene_by_position(17, 7566927, 7566927)
    self.assertEqual("TP53", symbol)

    # get_product_sequence
    product = marts.get_product_sequence(
        "Q15942", type=EIdentifierTypes.UNIPROT)
    self.assertIsNone(product)
    wanted = self.NP_001005353
    product = marts.get_product_sequence(
        "NP_001005353", type=EIdentifierTypes.REFSEQ)
    self.assertEqual(wanted, product)
    wanted = self.ENSP00000369497
    product = marts.get_product_sequence("ENSP00000369497", type=id_type)
    self.assertEqual(wanted, product)

    # get_transcript_sequence
    wanted = self.ENST00000361221[2]
    transcript_seq = marts.get_transcript_sequence(hit_id, type=id_type)
    self.assertEqual(wanted, transcript_seq)
    transcript_seq = marts.get_transcript_sequence(miss_id, type=id_type)
    self.assertIsNone(transcript_seq)

    # get_transcript_information
    wanted = self.ENST00000361221
    details = marts.get_transcript_information(hit_id, type=id_type)
    self.assertDictEqual(wanted, details)
    details = marts.get_transcript_information(miss_id, type=id_type)
    self.assertIsNone(details)
def _read_variant_file(path):
    """Read the variants stored in ``path``.

    Files ending in ``.GSvar`` or ``.tsv`` are read with ``read_GSvar``,
    files ending in ``.vcf`` with ``read_vcf``.  Only the first returned
    value (the variant list) is kept, because the caller regenerates the
    transcripts from the variants themselves.

    Returns the variant list, or ``None`` when the file extension is not
    one of the supported ones.
    """
    if path.endswith(('.GSvar', '.tsv')):
        return read_GSvar(path)[0]
    if path.endswith('.vcf'):
        return read_vcf(path)[0]
    return None


def __main__():
    """Create a FASTA database of individualized protein sequences.

    Parses the command line, reads the somatic (and optionally germline)
    variants, generates the transcripts that carry variants, translates
    them to proteins and appends these to a copy of the reference
    proteome given with ``--database``.  The copy is written to the
    current working directory, which is ``--output_dir`` when that
    directory can be entered.  Progress is logged to
    ``<identifier>_indproteinsDB.log`` in the same directory.

    Exits through ``parser.error`` when no somatic variant file is given
    or when a variant file has an unsupported extension.
    """
    parser = argparse.ArgumentParser(
        description=
        """Individualized Proteins 2.0 \n Script for generation of protein sequences which contain provided variants.""",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument(
        '-i', "--identifier",
        help="<Required> Predictions will be written with this name prefix",
        required=True)
    parser.add_argument(
        '-r', "--reference",
        help="Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument(
        '-db', "--database",
        help="Proteome sequence reference database to be attached to individualized sequences",
        required=True)
    parser.add_argument('-o', "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    # NOTE: --identifier and --database are required, so parse_args() has
    # already exited when no arguments were given; kept for parity with the
    # original command line behaviour.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    # The somatic variant file is not flagged as required by the parser but
    # everything below depends on it, so fail with a usage message instead
    # of an AttributeError on None.
    if args.somatic_mutations is None:
        parser.error("no somatic variant file given (-s/--somatic_mutations)")

    # Enter the output directory *before* configuring logging: --output_dir
    # is optional, so it must not be joined into the log file path blindly.
    if args.output_dir is None:
        directory_note = "Using current data directory."
    else:
        try:
            os.chdir(args.output_dir)
            directory_note = "Using provided data directory: {}".format(
                str(args.output_dir))
        except OSError:
            directory_note = "No such directory, using current."

    logging.basicConfig(
        filename='{}_indproteinsDB.log'.format(args.identifier),
        filemode='w+',
        level=logging.DEBUG)
    logging.info("Starting generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
    logging.info(directory_note)

    # start the actual IRMA functions
    #references = {'GRCh37': 'http://grch37.ensembl.org', 'GRCh38': 'http://ensembl.org'}
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }

    # read in variants
    vl = _read_variant_file(args.somatic_mutations)
    if vl is None:
        parser.error("unsupported somatic variant file '{}': expected "
                     ".GSvar, .tsv or .vcf".format(args.somatic_mutations))

    if args.germline_mutations is not None:
        vl_normal = _read_variant_file(args.germline_mutations)
        if vl_normal is None:
            parser.error("unsupported germline variant file '{}': expected "
                         ".GSvar, .tsv or .vcf".format(args.germline_mutations))
        # combine germline and somatic variants
        vl = vl + vl_normal

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # generate transcripts containing variants, filter for unmutated sequences
    transcripts = [
        g for g in generator.generate_transcripts_from_variants(
            vl, ma, ID_SYSTEM_USED) if g.vars
    ]

    # generate proteins from transcripts, table='Standard', stop_symbol='*', to_stop=True, cds=False
    proteins = generator.generate_proteins_from_transcripts(transcripts)

    out_ref = os.path.basename(args.database).replace(
        '.fasta',
        '_{}_individualized_protein_DB.fasta'.format(args.identifier))

    # Pass the arguments as a list so that paths containing whitespace are
    # not split into several arguments.
    if subprocess.call(['cp', args.database, out_ref]) != 0:
        logging.warning("Could not copy reference database {} to {}".format(
            args.database, out_ref))

    with open(out_ref, 'a') as outfile:
        for p in proteins:
            protein_variants = []
            for position in p.vars:
                protein_variants.extend(p.vars[position])
            # One header entry per distinct coding annotation of the variants.
            annotations = set(itertools.chain.from_iterable(
                x.coding.values() for x in protein_variants))
            aas = ','.join([y.aaMutationSyntax for y in annotations])

            outfile.write('>{}:{}\n'.format(p.transcript_id, aas))
            outfile.write('{}\n'.format(str(p)))

    logging.info("Finished generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def test_MartsAdapter(self):
    """Run MartsAdapter lookups on the GRCh37 BioMart and verify the results.

    Results are compared with fixtures held on the test case or with
    literal values; lookups for transcript id ENST00000614237 and UniProt
    id Q15942 must return None.
    """
    client = MartsAdapter(biomart="http://grch37.ensembl.org")
    as_ensembl = EIdentifierTypes.ENSEMBL
    present = 'ENST00000361221'
    absent = "ENST00000614237"

    # get_transcript_position
    span = client.get_transcript_position(
        present, '17953929', '17953943', type=as_ensembl)
    self.assertEqual((1515, 1529), span)
    span = client.get_transcript_position(
        absent, 7566927, 7566927, type=as_ensembl)
    self.assertIsNone(span)

    # get_gene_by_position
    gene_name = client.get_gene_by_position(17, 7566927, 7566927)
    self.assertEqual("TP53", gene_name)

    # get_product_sequence
    protein_seq = client.get_product_sequence(
        "Q15942", type=EIdentifierTypes.UNIPROT)
    self.assertIsNone(protein_seq)
    reference = self.NP_001005353
    protein_seq = client.get_product_sequence(
        "NP_001005353", type=EIdentifierTypes.REFSEQ)
    self.assertEqual(reference, protein_seq)
    reference = self.ENSP00000369497
    protein_seq = client.get_product_sequence(
        "ENSP00000369497", type=as_ensembl)
    self.assertEqual(reference, protein_seq)

    # get_transcript_sequence
    reference = self.ENST00000361221[2]
    rna_seq = client.get_transcript_sequence(present, type=as_ensembl)
    self.assertEqual(reference, rna_seq)
    rna_seq = client.get_transcript_sequence(absent, type=as_ensembl)
    self.assertIsNone(rna_seq)

    # get_transcript_information
    reference = self.ENST00000361221
    record = client.get_transcript_information(present, type=as_ensembl)
    self.assertDictEqual(reference, record)
    record = client.get_transcript_information(absent, type=as_ensembl)
    self.assertIsNone(record)
def test_MartsAdapter(self):
    """Run MartsAdapter lookups on the GRCh37 BioMart and verify the results.

    Results are compared with fixtures held on the test case or with
    literal values; lookups for transcript id ENST00000614237 and UniProt
    id Q15942 must return None.  The Ensembl ids returned for gene name
    TP53 are checked through their string representation.
    """
    client = MartsAdapter(biomart="http://grch37.ensembl.org")
    as_ensembl = EIdentifierTypes.ENSEMBL
    present = 'ENST00000361221'
    absent = "ENST00000614237"

    # get_transcript_position
    span = client.get_transcript_position(
        present, '17953929', '17953943', type=as_ensembl)
    self.assertEqual((1515, 1529), span)
    span = client.get_transcript_position(
        absent, 7566927, 7566927, type=as_ensembl)
    self.assertIsNone(span)

    # get_gene_by_position
    gene_name = client.get_gene_by_position(17, 7566927, 7566927)
    self.assertEqual("TP53", gene_name)

    # get_product_sequence
    protein_seq = client.get_product_sequence(
        "Q15942", type=EIdentifierTypes.UNIPROT)
    self.assertIsNone(protein_seq)
    reference = self.NP_001005353
    protein_seq = client.get_product_sequence(
        "NP_001005353", type=EIdentifierTypes.REFSEQ)
    self.assertEqual(reference, protein_seq)
    reference = self.ENSP00000369497
    protein_seq = client.get_product_sequence(
        "ENSP00000369497", type=as_ensembl)
    self.assertEqual(reference, protein_seq)

    # get_transcript_sequence
    reference = self.ENST00000361221[2]
    rna_seq = client.get_transcript_sequence(present, type=as_ensembl)
    self.assertEqual(reference, rna_seq)
    rna_seq = client.get_transcript_sequence(absent, type=as_ensembl)
    self.assertIsNone(rna_seq)

    # get_transcript_information
    reference = self.ENST00000361221
    record = client.get_transcript_information(present, type=as_ensembl)
    self.assertDictEqual(reference, record)
    record = client.get_transcript_information(absent, type=as_ensembl)
    self.assertIsNone(record)

    # get_ensembl_ids_from_id: compared as one string, entry by entry below.
    id_listing = str(
        client.get_ensembl_ids_from_id('TP53', type=EIdentifierTypes.GENENAME))
    expected_listing = (
        "[{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000413465', 4: 'ENSP00000410739'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000359597', 4: 'ENSP00000352610'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504290', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000510385', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504937', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000269305', 4: 'ENSP00000269305'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000455263', 4: 'ENSP00000398846'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000420246', 4: 'ENSP00000391127'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000445888', 4: 'ENSP00000391478'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000576024', 4: 'ENSP00000458393'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000509690', 4: 'ENSP00000425104'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000514944', 4: 'ENSP00000423862'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000574684', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000505014', 4: ''}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000508793', 4: 'ENSP00000424104'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000604348', 4: 'ENSP00000473895'}, "
        "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000503591', 4: 'ENSP00000426252'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t8', 4: 'LRG_321p8'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t7', 4: 'LRG_321p13'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t6', 4: 'LRG_321p12'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t5', 4: 'LRG_321p11'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t4', 4: 'LRG_321p10'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t3', 4: 'LRG_321p3'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t2', 4: 'LRG_321p2'}, "
        "{0: 'LRG_321', 1: '+', 3: 'LRG_321t1', 4: 'LRG_321p1'}]"
    )
    self.assertEqual(id_listing, expected_listing)