def parse_outputs(self, chunks):
    """Parse previously generated IEDB prediction files into per-allele/length TSVs.

    chunks: iterable of (split_start, split_end) entry ranges.

    For each chunk, allele, and epitope length this collects whichever
    per-method IEDB output files already exist on disk and runs
    self.output_parser over them, producing one parsed TSV per combination.
    Combinations whose parsed file already exists are reused as-is.
    This method only parses; it never invokes IEDB itself.

    Returns the list of parsed output file paths (existing plus newly created).
    """
    split_parsed_output_files = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # For non-fasta input the fasta entry range is doubled relative to the
        # TSV range — presumably two fasta records per TSV entry; TODO confirm.
        if self.input_file_type == 'fasta':
            fasta_chunk = tsv_chunk
        else:
            fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
        for a in self.alleles:
            for epl in self.epitope_lengths:
                split_iedb_output_files = []
                status_message("Parsing binding predictions for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                for method in self.prediction_algorithms:
                    # Prediction classes are resolved by name from module globals.
                    prediction_class = globals()[method]
                    prediction = prediction_class()
                    # Some prediction classes expose an IEDB-specific method
                    # name; otherwise the algorithm name itself is used.
                    if hasattr(prediction, 'iedb_prediction_method'):
                        iedb_method = prediction.iedb_prediction_method
                    else:
                        iedb_method = method
                    valid_alleles = prediction.valid_allele_names()
                    if a not in valid_alleles:
                        continue
                    valid_lengths = prediction.valid_lengths_for_allele(a)
                    if epl not in valid_lengths:
                        continue
                    split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(epl), "tsv_%s" % fasta_chunk]))
                    # Only already-existing prediction files are collected for parsing.
                    if os.path.exists(split_iedb_out):
                        split_iedb_output_files.append(split_iedb_out)
                split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(epl), "parsed", "tsv_%s" % fasta_chunk]))
                if os.path.exists(split_parsed_file_path):
                    status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, epl, fasta_chunk))
                    split_parsed_output_files.append(split_parsed_file_path)
                    continue
                # pvacvector input uses a fixed per-length fasta naming scheme.
                if self.input_file_type == 'pvacvector_input_fasta':
                    split_fasta_file_path = "{}_1-2.{}.tsv".format(self.split_fasta_basename(), epl)
                else:
                    split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(), fasta_chunk)
                split_fasta_key_file_path = split_fasta_file_path + '.key'
                if len(split_iedb_output_files) > 0:
                    status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                    split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                    params = {
                        'input_iedb_files' : split_iedb_output_files,
                        'input_tsv_file' : split_tsv_file_path,
                        'key_file' : split_fasta_key_file_path,
                        'output_file' : split_parsed_file_path,
                    }
                    # Include the sample name column only when explicitly requested.
                    if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
                        params['sample_name'] = self.sample_name
                    else:
                        params['sample_name'] = None
                    parser = self.output_parser(params)
                    parser.execute()
                    status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s - Completed" % (a, epl, fasta_chunk))
                    split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB binding predictions per chunk/allele/method, then parse the results.

    chunks: iterable of (split_start, split_end) entry ranges.

    For each chunk and allele, every valid prediction algorithm is run via
    lib.call_iedb.main (reusing existing output files when present), and the
    per-method outputs are combined into one parsed TSV via
    lib.parse_output.main.

    Returns the list of parsed output file paths.
    """
    split_parsed_output_files = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # The fasta entry range is doubled relative to the TSV range —
        # presumably two fasta records per TSV entry; TODO confirm.
        fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
        for a in self.alleles:
            split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(), fasta_chunk)
            split_iedb_output_files = []
            status_message("Processing entries for Allele %s - Entries %s" % (a, fasta_chunk))
            for method in self.prediction_algorithms:
                # Prediction classes are resolved by name from module globals.
                prediction_class = globals()[method]
                prediction = prediction_class()
                iedb_method = prediction.iedb_prediction_method
                valid_alleles = prediction.valid_allele_names()
                if a not in valid_alleles:
                    status_message("Allele %s not valid for Method %s. Skipping." % (a, method))
                    continue
                split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, "tsv_%s" % fasta_chunk]))
                # Reuse an existing prediction file instead of re-running IEDB.
                if os.path.exists(split_iedb_out):
                    status_message("IEDB file for Allele %s with Method %s (Entries %s) already exists. Skipping." % (a, method, fasta_chunk))
                    split_iedb_output_files.append(split_iedb_out)
                    continue
                status_message("Running IEDB on Allele %s with Method %s - Entries %s" % (a, method, fasta_chunk))
                lib.call_iedb.main([
                    split_fasta_file_path,
                    split_iedb_out,
                    iedb_method,
                    a,
                    '-r', str(self.iedb_retries),
                    '-e', self.iedb_executable,
                ])
                status_message("Completed")
                split_iedb_output_files.append(split_iedb_out)
            split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, "parsed", "tsv_%s" % fasta_chunk]))
            if os.path.exists(split_parsed_file_path):
                status_message("Parsed Output File for Allele %s (Entries %s) already exists. Skipping" % (a, fasta_chunk))
                split_parsed_output_files.append(split_parsed_file_path)
                continue
            split_fasta_key_file_path = split_fasta_file_path + '.key'
            if len(split_iedb_output_files) > 0:
                status_message("Parsing IEDB Output for Allele %s - Entries %s" % (a, fasta_chunk))
                split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                # lib.parse_output.main takes argv-style positional args.
                params = [
                    *split_iedb_output_files,
                    split_tsv_file_path,
                    split_fasta_key_file_path,
                    split_parsed_file_path,
                    '-m', self.top_score_metric,
                ]
                if self.top_result_per_mutation == True:
                    params.append('-t')
                lib.parse_output.main(params)
                status_message("Completed")
                split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files
def valid_alleles_per_algorithm(prediction_algorithms):
    """Return a dict mapping each algorithm name to its supported allele names.

    prediction_algorithms is a comma-separated string of algorithm names;
    each name is resolved to a prediction class via module globals.
    valid_allele_names() may return dict_keys, so the result is coerced to
    a list for each algorithm.
    """
    return {
        algorithm: list(globals()[algorithm]().valid_allele_names())
        for algorithm in prediction_algorithms.split(",")
    }
def valid_alleles(prediction_algorithms):
    """Return a dict mapping each algorithm name to its supported allele names.

    prediction_algorithms is a comma-separated string of algorithm names;
    each is looked up as a prediction class in module globals.
    """
    allele_map = {}
    for name in prediction_algorithms.split(","):
        instance = globals()[name]()
        # valid_allele_names() sometimes returns dict_keys, so coerce to list
        allele_map[name] = list(instance.valid_allele_names())
    return allele_map
def main(args_input = sys.argv[1:]):
    """Print the sorted valid allele names, one per line.

    With no --prediction-algorithm, prints every allele known to any
    prediction class; otherwise prints only the alleles supported by the
    named algorithm.
    """
    parser = define_parser()
    args = parser.parse_args(args_input)
    if args.prediction_algorithm is None:
        names = PredictionClass.all_valid_allele_names()
    else:
        # Resolve the prediction class by name from module globals.
        names = globals()[args.prediction_algorithm]().valid_allele_names()
    print("\n".join(sorted(names)))
def main(args_input=sys.argv[1:]):
    """Print sorted valid allele names, one per line.

    Prints the alleles of the chosen prediction algorithm, or of all
    prediction classes when none was specified.
    """
    args = define_parser().parse_args(args_input)
    if args.prediction_algorithm is not None:
        # Resolve the prediction class by name from module globals.
        prediction = globals()[args.prediction_algorithm]()
        print("\n".join(sorted(prediction.valid_allele_names())))
    else:
        print('\n'.join(sorted(PredictionClass.all_valid_allele_names())))
def main(args_input=sys.argv[1:]):
    """CLI entry point: run one IEDB binding prediction and write its output.

    Parses an input FASTA, method, allele, and optional epitope length,
    validates them against the chosen prediction class, runs the prediction,
    and atomically writes the result to output_file via a .tmp file and
    os.replace.
    """
    parser = argparse.ArgumentParser(
        'pvacseq call_iedb',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_file', type=argparse.FileType('r'),
                        help="Input FASTA file")
    parser.add_argument('output_file', help="Output file from iedb")
    parser.add_argument('method',
                        choices=PredictionClass.prediction_methods(),
                        help="The iedb analysis method to use")
    parser.add_argument('allele', help="Allele for which to make prediction")
    parser.add_argument('-l', '--epitope-length', type=int,
                        choices=[8, 9, 10, 11, 12, 13, 14, 15],
                        help="Length of subpeptides (epitopes) to predict")
    parser.add_argument(
        "-r", "--iedb-retries", type=int, default=5,
        help="Number of retries when making requests to the IEDB RESTful web interface. Must be less than or equal to 100."
    )
    parser.add_argument("-e", "--iedb-executable-path",
                        help="The executable path of the local IEDB install")
    args = parser.parse_args(args_input)
    PredictionClass.check_alleles_valid([args.allele])
    # The method name doubles as the prediction class name in this module.
    prediction_class = getattr(sys.modules[__name__], args.method)
    prediction_class_object = prediction_class()
    prediction_class_object.check_allele_valid(args.allele)
    # NOTE(review): this length check runs before the explicit None guard
    # below — looks like it must tolerate epitope_length=None; confirm.
    prediction_class_object.check_length_valid_for_allele(
        args.epitope_length, args.allele)
    if args.epitope_length is None and prediction_class_object.needs_epitope_length:
        sys.exit("Epitope length is required for class I binding predictions")
    # predict() returns either a pandas DataFrame ('pandas' mode) or raw
    # text plus a file-open mode string.
    (response_text, output_mode) = prediction_class_object.predict(
        args.input_file, args.allele, args.epitope_length,
        args.iedb_executable_path, args.iedb_retries)
    # Write to a temp file first, then atomically move into place so a
    # partially written output file never exists under the final name.
    tmp_output_file = args.output_file + '.tmp'
    if output_mode == 'pandas':
        response_text.to_csv(tmp_output_file, index=False, sep="\t")
    else:
        tmp_output_filehandle = open(tmp_output_file, output_mode)
        tmp_output_filehandle.write(response_text)
        tmp_output_filehandle.close()
    os.replace(tmp_output_file, args.output_file)
    args.input_file.close()
def main(args_input=sys.argv[1:]):
    """Driver: validate arguments, split algorithms/alleles by MHC class, and
    run the MHC class I and/or class II prediction pipelines.

    Exits with an error message on invalid sample name, odd fasta size,
    too many IEDB retries, or a malformed downstream sequence length.
    Creates combined reports only when both class I and class II
    predictions were run.
    """
    parser = define_parser()
    args = parser.parse_args(args_input)
    # '.' is used as a field separator in generated file names.
    if "." in args.sample_name:
        sys.exit("Sample name cannot contain '.'")
    if args.fasta_size % 2 != 0:
        sys.exit("The fasta size needs to be an even number")
    if args.iedb_retries > 100:
        sys.exit(
            "The number of IEDB retries must be less than or equal to 100")
    # 'full' means unlimited downstream sequence (None sentinel).
    if args.downstream_sequence_length == 'full':
        downstream_sequence_length = None
    elif args.downstream_sequence_length.isdigit():
        downstream_sequence_length = int(args.downstream_sequence_length)
    else:
        sys.exit(
            "The downstream sequence length needs to be a positive integer or 'full'"
        )
    # if args.iedb_install_directory:
    #     lib.call_iedb.setup_iedb_conda_env()
    input_file_type = 'bedpe'
    base_output_dir = os.path.abspath(args.output_dir)
    # Partition the requested algorithms into MHC class I vs class II.
    class_i_prediction_algorithms = []
    class_ii_prediction_algorithms = []
    for prediction_algorithm in sorted(args.prediction_algorithms):
        prediction_class = globals()[prediction_algorithm]
        prediction_class_object = prediction_class()
        if isinstance(prediction_class_object, MHCI):
            class_i_prediction_algorithms.append(prediction_algorithm)
        elif isinstance(prediction_class_object, MHCII):
            class_ii_prediction_algorithms.append(prediction_algorithm)
    # Partition the requested alleles; an allele may be valid for both classes.
    class_i_alleles = []
    class_ii_alleles = []
    for allele in sorted(set(args.allele)):
        valid = 0
        if allele in MHCI.all_valid_allele_names():
            class_i_alleles.append(allele)
            valid = 1
        if allele in MHCII.all_valid_allele_names():
            class_ii_alleles.append(allele)
            valid = 1
        if not valid:
            print("Allele %s not valid. Skipping." % allele)
    # Arguments common to both pipeline runs; class-specific keys are
    # added to copies below.
    shared_arguments = {
        'input_file': args.input_file,
        'input_file_type': input_file_type,
        'sample_name': args.sample_name,
        'top_score_metric': args.top_score_metric,
        'binding_threshold': args.binding_threshold,
        'allele_specific_cutoffs': args.allele_specific_binding_thresholds,
        'net_chop_method': args.net_chop_method,
        'net_chop_threshold': args.net_chop_threshold,
        'additional_report_columns': args.additional_report_columns,
        'fasta_size': args.fasta_size,
        'iedb_retries': args.iedb_retries,
        'downstream_sequence_length': downstream_sequence_length,
        'keep_tmp_files': args.keep_tmp_files,
        'n_threads': args.n_threads,
    }
    if len(class_i_prediction_algorithms) > 0 and len(class_i_alleles) > 0:
        if args.epitope_length is None:
            sys.exit(
                "Epitope length is required for class I binding predictions")
        # A local IEDB install is optional; None falls back to the web API
        # elsewhere — presumably in lib.call_iedb; confirm.
        if args.iedb_install_directory:
            iedb_mhc_i_executable = os.path.join(args.iedb_install_directory,
                                                 'mhc_i', 'src',
                                                 'predict_binding.py')
            if not os.path.exists(iedb_mhc_i_executable):
                sys.exit("IEDB MHC I executable path doesn't exist %s" %
                         iedb_mhc_i_executable)
        else:
            iedb_mhc_i_executable = None
        print("Executing MHC Class I predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_I')
        os.makedirs(output_dir, exist_ok=True)
        class_i_arguments = shared_arguments.copy()
        class_i_arguments['alleles'] = class_i_alleles
        class_i_arguments[
            'peptide_sequence_length'] = args.peptide_sequence_length
        class_i_arguments['iedb_executable'] = iedb_mhc_i_executable
        class_i_arguments['epitope_lengths'] = args.epitope_length
        class_i_arguments[
            'prediction_algorithms'] = class_i_prediction_algorithms
        class_i_arguments['output_dir'] = output_dir
        class_i_arguments['netmhc_stab'] = args.netmhc_stab
        pipeline = Pipeline(**class_i_arguments)
        pipeline.execute()
    elif len(class_i_prediction_algorithms) == 0:
        print(
            "No MHC class I prediction algorithms chosen. Skipping MHC class I predictions."
        )
    elif len(class_i_alleles) == 0:
        print("No MHC class I alleles chosen. Skipping MHC class I predictions.")
    if len(class_ii_prediction_algorithms) > 0 and len(class_ii_alleles) > 0:
        if args.iedb_install_directory:
            iedb_mhc_ii_executable = os.path.join(args.iedb_install_directory,
                                                  'mhc_ii',
                                                  'mhc_II_binding.py')
            if not os.path.exists(iedb_mhc_ii_executable):
                sys.exit("IEDB MHC II executable path doesn't exist %s" %
                         iedb_mhc_ii_executable)
        else:
            iedb_mhc_ii_executable = None
        print("Executing MHC Class II predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_II')
        os.makedirs(output_dir, exist_ok=True)
        class_ii_arguments = shared_arguments.copy()
        class_ii_arguments['alleles'] = class_ii_alleles
        class_ii_arguments[
            'prediction_algorithms'] = class_ii_prediction_algorithms
        # Class II uses fixed values: 31-mer peptides, 15-mer epitopes,
        # and no NetMHCstab.
        class_ii_arguments['peptide_sequence_length'] = 31
        class_ii_arguments['iedb_executable'] = iedb_mhc_ii_executable
        class_ii_arguments['epitope_lengths'] = [15]
        class_ii_arguments['output_dir'] = output_dir
        class_ii_arguments['netmhc_stab'] = False
        pipeline = Pipeline(**class_ii_arguments)
        pipeline.execute()
    elif len(class_ii_prediction_algorithms) == 0:
        print(
            "No MHC class II prediction algorithms chosen. Skipping MHC class II predictions."
        )
    elif len(class_ii_alleles) == 0:
        print(
            "No MHC class II alleles chosen. Skipping MHC class II predictions."
        )
    # Combined reports only make sense when both classes actually ran.
    if len(class_i_prediction_algorithms) > 0 and len(
            class_i_alleles) > 0 and len(class_ii_prediction_algorithms
                                         ) > 0 and len(class_ii_alleles) > 0:
        print("Creating combined reports")
        create_combined_reports(base_output_dir, args)
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB predictions per chunk/allele/method with web-API rate limiting,
    then parse the per-method outputs via self.output_parser.

    chunks: iterable of (split_start, split_end) entry ranges.

    Empty fasta chunks are skipped. Existing prediction and parsed files are
    reused. When no local IEDB executable is configured, consecutive requests
    are throttled to at most one per 60 seconds (unless the TEST_FLAG
    environment variable is set to a non-'0' value).

    Returns the list of parsed output file paths.
    """
    split_parsed_output_files = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # The fasta entry range is doubled relative to the TSV range —
        # presumably two fasta records per TSV entry; TODO confirm.
        fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
        for a in self.alleles:
            split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(), fasta_chunk)
            split_iedb_output_files = []
            status_message("Processing entries for Allele %s - Entries %s" % (a, fasta_chunk))
            # Nothing to predict for an empty fasta chunk.
            if os.path.getsize(split_fasta_file_path) == 0:
                status_message("Fasta file is empty. Skipping")
                continue
            for method in self.prediction_algorithms:
                # Prediction classes are resolved by name from module globals.
                prediction_class = globals()[method]
                prediction = prediction_class()
                if hasattr(prediction, 'iedb_prediction_method'):
                    iedb_method = prediction.iedb_prediction_method
                else:
                    iedb_method = method
                valid_alleles = prediction.valid_allele_names()
                if a not in valid_alleles:
                    status_message("Allele %s not valid for Method %s. Skipping." % (a, method))
                    continue
                split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, "tsv_%s" % fasta_chunk]))
                if os.path.exists(split_iedb_out):
                    status_message("IEDB file for Allele %s with Method %s (Entries %s) already exists. Skipping." % (a, method, fasta_chunk))
                    split_iedb_output_files.append(split_iedb_out)
                    continue
                status_message("Running IEDB on Allele %s with Method %s - Entries %s" % (a, method, fasta_chunk))
                # Throttle web-API calls: without a local executable, wait so
                # that requests are at least 60s apart. Skipped entirely when
                # TEST_FLAG is set to a truthy, non-'0' value.
                if not os.environ.get('TEST_FLAG') or os.environ.get('TEST_FLAG') == '0':
                    if 'last_execute_timestamp' in locals() and not self.iedb_executable:
                        elapsed_time = (
                            datetime.datetime.now() - last_execute_timestamp
                        ).total_seconds()
                        wait_time = 60 - elapsed_time
                        if wait_time > 0:
                            time.sleep(wait_time)
                # NOTE(review): passes `method` here, whereas sibling variants
                # pass `iedb_method` — confirm which name lib.call_iedb expects.
                lib.call_iedb.main([
                    split_fasta_file_path,
                    split_iedb_out,
                    method,
                    a,
                    '-r', str(self.iedb_retries),
                    '-e', self.iedb_executable,
                ])
                last_execute_timestamp = datetime.datetime.now()
                status_message("Completed")
                split_iedb_output_files.append(split_iedb_out)
            split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, "parsed", "tsv_%s" % fasta_chunk]))
            if os.path.exists(split_parsed_file_path):
                status_message("Parsed Output File for Allele %s (Entries %s) already exists. Skipping" % (a, fasta_chunk))
                split_parsed_output_files.append(split_parsed_file_path)
                continue
            split_fasta_key_file_path = split_fasta_file_path + '.key'
            if len(split_iedb_output_files) > 0:
                status_message("Parsing IEDB Output for Allele %s - Entries %s" % (a, fasta_chunk))
                split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                params = {
                    'input_iedb_files' : split_iedb_output_files,
                    'input_tsv_file' : split_tsv_file_path,
                    'key_file' : split_fasta_key_file_path,
                    'output_file' : split_parsed_file_path,
                }
                # Include the sample name column only when explicitly requested.
                if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
                    params['sample_name'] = self.sample_name
                else:
                    params['sample_name'] = None
                parser = self.output_parser(params)
                parser.execute()
                status_message("Completed")
                split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB predictions per chunk/allele/method and parse the outputs.

    chunks: iterable of chunk labels used directly in file names (this
    variant does not unpack start/end pairs). Existing prediction and
    parsed files are reused. Results of all methods for one allele/chunk
    are combined via lib.parse_output.main.

    Returns the list of parsed output file paths.
    """
    split_parsed_output_files = []
    for chunk in chunks:
        for a in self.alleles:
            split_fasta_file_path = "%s_%s" % (self.split_fasta_basename(), chunk)
            split_iedb_output_files = []
            print("Processing entries for Allele %s - Entries %s" % (a, chunk))
            for method in self.prediction_algorithms:
                # Prediction classes are resolved by name from module globals.
                prediction_class = globals()[method]
                prediction = prediction_class()
                iedb_method = prediction.iedb_prediction_method
                valid_alleles = prediction.valid_allele_names()
                if a not in valid_alleles:
                    print("Allele %s not valid for Method %s. Skipping." %
                          (a, method))
                    continue
                split_iedb_out = os.path.join(
                    self.tmp_dir, ".".join([
                        self.sample_name, iedb_method, a, "tsv_%s" % chunk
                    ]))
                # Reuse an existing prediction file instead of re-running IEDB.
                if os.path.exists(split_iedb_out):
                    print(
                        "IEDB file for Allele %s with Method %s (Entries %s) already exists. Skipping."
                        % (a, method, chunk))
                    split_iedb_output_files.append(split_iedb_out)
                    continue
                print(
                    "Running IEDB on Allele %s with Method %s - Entries %s" %
                    (a, method, chunk))
                # No retry/executable options in this variant.
                lib.call_iedb.main([
                    split_fasta_file_path,
                    split_iedb_out,
                    iedb_method,
                    a,
                ])
                print("Completed")
                split_iedb_output_files.append(split_iedb_out)
            split_parsed_file_path = os.path.join(
                self.tmp_dir,
                ".".join([self.sample_name, a, "parsed", "tsv_%s" % chunk]))
            if os.path.exists(split_parsed_file_path):
                print(
                    "Parsed Output File for Allele %s (Entries %s) already exists. Skipping"
                    % (a, chunk))
                split_parsed_output_files.append(split_parsed_file_path)
                continue
            split_fasta_key_file_path = split_fasta_file_path + '.key'
            if len(split_iedb_output_files) > 0:
                print("Parsing IEDB Output for Allele %s - Entries %s" %
                      (a, chunk))
                # Note: this variant passes the full (unsplit) TSV path.
                params = [
                    *split_iedb_output_files,
                    self.tsv_file_path(),
                    split_fasta_key_file_path,
                    split_parsed_file_path,
                    '-m', self.top_score_metric,
                ]
                if self.top_result_per_mutation == True:
                    params.append('-t')
                lib.parse_output.main(params)
                print("Completed")
                split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB predictions per chunk/allele/epitope-length/method, then parse.

    chunks: iterable of (split_start, split_end) entry ranges.

    Skips method/allele/length combinations the prediction class does not
    support, reuses existing prediction and parsed files, and throttles
    web-API calls to one per 60 seconds when no local IEDB executable is
    configured (unless TEST_FLAG is set to a non-'0' value).

    Returns the list of parsed output file paths.
    """
    split_parsed_output_files = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # The fasta entry range is doubled relative to the TSV range —
        # presumably two fasta records per TSV entry; TODO confirm.
        fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
        for a in self.alleles:
            for epl in self.epitope_lengths:
                split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(), fasta_chunk)
                split_iedb_output_files = []
                status_message("Processing entries for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                for method in self.prediction_algorithms:
                    # Prediction classes are resolved by name from module globals.
                    prediction_class = globals()[method]
                    prediction = prediction_class()
                    iedb_method = prediction.iedb_prediction_method
                    valid_alleles = prediction.valid_allele_names()
                    if a not in valid_alleles:
                        status_message("Allele %s not valid for Method %s. Skipping." % (a, method))
                        continue
                    valid_lengths = prediction.valid_lengths_for_allele(a)
                    if epl not in valid_lengths:
                        status_message("Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (epl, method, a))
                        continue
                    split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(epl), "tsv_%s" % fasta_chunk]))
                    if os.path.exists(split_iedb_out):
                        status_message("IEDB file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, epl, method, fasta_chunk))
                        split_iedb_output_files.append(split_iedb_out)
                        continue
                    status_message("Running IEDB on Allele %s and Epitope Length %s with Method %s - Entries %s" % (a, epl, method, fasta_chunk))
                    # Throttle web-API calls: without a local executable, wait
                    # so that requests are at least 60s apart. Skipped when
                    # TEST_FLAG is set to a truthy, non-'0' value.
                    if not os.environ.get('TEST_FLAG') or os.environ.get('TEST_FLAG') == '0':
                        if 'last_execute_timestamp' in locals() and not self.iedb_executable:
                            elapsed_time = (
                                datetime.datetime.now() - last_execute_timestamp
                            ).total_seconds()
                            wait_time = 60 - elapsed_time
                            if wait_time > 0:
                                time.sleep(wait_time)
                    lib.call_iedb.main([
                        split_fasta_file_path,
                        split_iedb_out,
                        iedb_method,
                        a,
                        '-l', str(epl),
                        '-r', str(self.iedb_retries),
                        '-e', self.iedb_executable,
                    ])
                    last_execute_timestamp = datetime.datetime.now()
                    status_message("Completed")
                    split_iedb_output_files.append(split_iedb_out)
                split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(epl), "parsed", "tsv_%s" % fasta_chunk]))
                if os.path.exists(split_parsed_file_path):
                    status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, epl, fasta_chunk))
                    split_parsed_output_files.append(split_parsed_file_path)
                    continue
                split_fasta_key_file_path = split_fasta_file_path + '.key'
                if len(split_iedb_output_files) > 0:
                    status_message("Parsing IEDB Output for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                    split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                    # lib.parse_output.main takes argv-style positional args.
                    params = [
                        *split_iedb_output_files,
                        split_tsv_file_path,
                        split_fasta_key_file_path,
                        split_parsed_file_path,
                        '-m', self.top_score_metric,
                    ]
                    if self.top_result_per_mutation == True:
                        params.append('-t')
                    lib.parse_output.main(params)
                    status_message("Completed")
                    split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files
def call_iedb(self, chunks, length):
    """Build the full set of IEDB invocation argument lists for one epitope
    length, then execute them in parallel with pymp.

    chunks: iterable of (split_start, split_end) entry ranges.
    length: the epitope length to predict for.

    Invalid or already-computed combinations are skipped; each distinct
    skip reason is logged once (deduplicated via warning_messages).
    Returns None; results are written to files under self.tmp_dir.
    """
    alleles = self.alleles
    prediction_algorithms = self.prediction_algorithms
    argument_sets = []
    warning_messages = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # The fasta entry range is doubled relative to the TSV range for
        # non-fasta input — presumably two fasta records per entry; confirm.
        if self.input_file_type == 'fasta':
            fasta_chunk = tsv_chunk
        else:
            fasta_chunk = "%d-%d" % (split_start * 2 - 1, split_end * 2)
        for a in alleles:
            split_fasta_file_path = "%s_%s" % (
                self.split_fasta_basename(length), fasta_chunk)
            # Nothing to predict for an empty fasta chunk.
            if os.path.getsize(split_fasta_file_path) == 0:
                msg = "Fasta file {} is empty. Skipping".format(
                    split_fasta_file_path)
                if msg not in warning_messages:
                    warning_messages.append(msg)
                continue
            #begin of per-algorithm processing
            for method in prediction_algorithms:
                # Prediction classes are resolved by name from module globals.
                prediction_class = globals()[method]
                prediction = prediction_class()
                if hasattr(prediction, 'iedb_prediction_method'):
                    iedb_method = prediction.iedb_prediction_method
                else:
                    iedb_method = method
                valid_alleles = prediction.valid_allele_names()
                if a not in valid_alleles:
                    msg = "Allele %s not valid for Method %s. Skipping." % (
                        a, method)
                    if msg not in warning_messages:
                        warning_messages.append(msg)
                    continue
                valid_lengths = prediction.valid_lengths_for_allele(a)
                if length not in valid_lengths:
                    msg = "Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (
                        length, method, a)
                    if msg not in warning_messages:
                        warning_messages.append(msg)
                    continue
                split_iedb_out = os.path.join(
                    self.tmp_dir, ".".join([
                        self.sample_name, iedb_method, a, str(length),
                        "tsv_%s" % fasta_chunk
                    ]))
                # Reuse existing prediction files instead of re-running.
                if os.path.exists(split_iedb_out):
                    msg = "Prediction file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (
                        a, length, method, fasta_chunk)
                    if msg not in warning_messages:
                        warning_messages.append(msg)
                    continue
                arguments = [
                    split_fasta_file_path,
                    split_iedb_out,
                    method,
                    a,
                    '-r', str(self.iedb_retries),
                    '-e', self.iedb_executable,
                ]
                # MHC class II methods take no epitope-length option.
                if not isinstance(prediction, IEDBMHCII):
                    arguments.extend([
                        '-l',
                        str(length),
                    ])
                argument_sets.append(arguments)
    for msg in warning_messages:
        status_message(msg)
    # Fan the collected invocations out over self.n_threads workers.
    with pymp.Parallel(self.n_threads) as p:
        for index in p.range(len(argument_sets)):
            arguments = argument_sets[index]
            # Positional layout of `arguments` fixed above:
            # [fasta, out_file, method, allele, '-r', retries, '-e', exe, ('-l', length)]
            a = arguments[3]
            method = arguments[2]
            filename = arguments[1]
            # 10 args means the '-l <length>' pair is present; otherwise it is
            # an MHC class II call, reported with its fixed length of 15.
            if len(arguments) == 10:
                epl = arguments[9]
            else:
                epl = 15
            p.print(
                "Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s"
                % (a, epl, method, filename))
            lib.call_iedb.main(arguments)
            p.print(
                "Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s - Completed"
                % (a, epl, method, filename))
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB predictions and parsing with nested pymp parallelism over
    files, alleles, epitope lengths, and algorithms.

    chunks: list of (split_start, split_end) entry ranges.

    Thread counts per nesting level come from self.balance_multithreads.
    Existing prediction/parsed files are reused; web-API calls are
    throttled to one per 60 seconds when no local IEDB executable is
    configured (unless TEST_FLAG is set to a non-'0' value).

    Returns the list of parsed output file paths.

    NOTE(review): split_parsed_output_files is a plain list appended to
    inside pymp.Parallel workers — if pymp forks processes, those appends
    may not be visible in the parent; confirm whether a pymp shared list
    is needed here.
    """
    # Allow pymp.Parallel blocks to nest inside each other.
    pymp.config.nested = True
    alleles = self.alleles
    epitope_lengths = self.epitope_lengths
    prediction_algorithms = self.prediction_algorithms
    # Describe the iteration space of each nesting level; balance_multithreads
    # decides how many threads each level actually gets.
    iteration_info = {
        'file': {
            'total_iterations': len(chunks),
            'iterations_per_thread': len(chunks),
            'threads': 1,
        },
        'allele': {
            'total_iterations': len(alleles),
            'iterations_per_thread': len(alleles),
            'threads': 1,
        },
        'length': {
            'total_iterations': len(epitope_lengths),
            'iterations_per_thread': len(epitope_lengths),
            'threads': 1,
        },
        'algorithm': {
            'total_iterations': len(prediction_algorithms),
            'iterations_per_thread': len(prediction_algorithms),
            'threads': 1,
        },
    }
    iteration_info = self.balance_multithreads(iteration_info)
    split_parsed_output_files = []
    # Lock serializes status messages across parallel workers.
    lock = Lock()
    with pymp.Parallel(iteration_info['file']['threads']) as p:
        for i in p.range(len(chunks)):
            (split_start, split_end) = chunks[i]
            tsv_chunk = "%d-%d" % (split_start, split_end)
            # The fasta entry range is doubled relative to the TSV range —
            # presumably two fasta records per TSV entry; TODO confirm.
            fasta_chunk = "%d-%d" % (split_start * 2 - 1, split_end * 2)
            with pymp.Parallel(iteration_info['allele']['threads']) as p2:
                for j in p2.range(len(alleles)):
                    a = alleles[j]
                    with pymp.Parallel(
                            iteration_info['length']['threads']) as p3:
                        for k in p3.range(len(epitope_lengths)):
                            epl = epitope_lengths[k]
                            # pvacvector input uses a fixed per-length naming scheme.
                            if self.input_file_type == 'pvacvector_input_fasta':
                                split_fasta_file_path = "{}_1-2.{}.tsv".format(
                                    self.split_fasta_basename(), epl)
                            else:
                                split_fasta_file_path = "%s_%s" % (
                                    self.split_fasta_basename(), fasta_chunk)
                            split_iedb_output_files = []
                            status_message_with_lock(
                                "Processing entries for Allele %s and Epitope Length %s - Entries %s"
                                % (a, epl, fasta_chunk), lock)
                            # Nothing to predict for an empty fasta chunk.
                            if os.path.getsize(split_fasta_file_path) == 0:
                                status_message_with_lock(
                                    "Fasta file is empty. Skipping", lock)
                                continue
                            #begin of per-algorithm processing
                            with pymp.Parallel(iteration_info['algorithm']
                                               ['threads']) as p4:
                                for m in p4.range(
                                        len(prediction_algorithms)):
                                    method = prediction_algorithms[m]
                                    # Prediction classes are resolved by name
                                    # from module globals.
                                    prediction_class = globals()[method]
                                    prediction = prediction_class()
                                    if hasattr(prediction,
                                               'iedb_prediction_method'):
                                        iedb_method = prediction.iedb_prediction_method
                                    else:
                                        iedb_method = method
                                    valid_alleles = prediction.valid_allele_names(
                                    )
                                    if a not in valid_alleles:
                                        status_message_with_lock(
                                            "Allele %s not valid for Method %s. Skipping."
                                            % (a, method), lock)
                                        continue
                                    valid_lengths = prediction.valid_lengths_for_allele(
                                        a)
                                    if epl not in valid_lengths:
                                        status_message_with_lock(
                                            "Epitope Length %s is not valid for Method %s and Allele %s. Skipping."
                                            % (epl, method, a), lock)
                                        continue
                                    split_iedb_out = os.path.join(
                                        self.tmp_dir, ".".join([
                                            self.sample_name, iedb_method, a,
                                            str(epl),
                                            "tsv_%s" % fasta_chunk
                                        ]))
                                    # Reuse an existing prediction file.
                                    if os.path.exists(split_iedb_out):
                                        status_message_with_lock(
                                            "IEDB file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping."
                                            % (a, epl, method, fasta_chunk),
                                            lock)
                                        split_iedb_output_files.append(
                                            split_iedb_out)
                                        continue
                                    status_message_with_lock(
                                        "Running IEDB on Allele %s and Epitope Length %s with Method %s - Entries %s"
                                        % (a, epl, method, fasta_chunk), lock)
                                    # Throttle web-API calls to one per 60s
                                    # when no local executable is configured;
                                    # skipped when TEST_FLAG is a truthy,
                                    # non-'0' value.
                                    if not os.environ.get(
                                            'TEST_FLAG') or os.environ.get(
                                                'TEST_FLAG') == '0':
                                        if 'last_execute_timestamp' in locals(
                                        ) and not self.iedb_executable:
                                            elapsed_time = (
                                                datetime.datetime.now() -
                                                last_execute_timestamp
                                            ).total_seconds()
                                            wait_time = 60 - elapsed_time
                                            if wait_time > 0:
                                                time.sleep(wait_time)
                                    arguments = [
                                        split_fasta_file_path,
                                        split_iedb_out,
                                        method,
                                        a,
                                        '-r', str(self.iedb_retries),
                                        '-e', self.iedb_executable,
                                    ]
                                    # MHC class II methods take no epitope-length option.
                                    if not isinstance(
                                            prediction, IEDBMHCII):
                                        arguments.extend([
                                            '-l',
                                            str(epl),
                                        ])
                                    lib.call_iedb.main(arguments)
                                    last_execute_timestamp = datetime.datetime.now(
                                    )
                                    status_message_with_lock(
                                        "Running IEDB on Allele %s and Epitope Length %s with Method %s - Entries %s - Completed"
                                        % (a, epl, method, fasta_chunk), lock)
                                    split_iedb_output_files.append(
                                        split_iedb_out)
                            #end of per-algorithm processing
                            #parse all output files for one allele, epitope, and file chunk over all algorithms into one file
                            split_parsed_file_path = os.path.join(
                                self.tmp_dir, ".".join([
                                    self.sample_name, a,
                                    str(epl), "parsed",
                                    "tsv_%s" % fasta_chunk
                                ]))
                            if os.path.exists(split_parsed_file_path):
                                status_message_with_lock(
                                    "Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping"
                                    % (a, epl, fasta_chunk), lock)
                                split_parsed_output_files.append(
                                    split_parsed_file_path)
                                continue
                            split_fasta_key_file_path = split_fasta_file_path + '.key'
                            if len(split_iedb_output_files) > 0:
                                status_message_with_lock(
                                    "Parsing IEDB Output for Allele %s and Epitope Length %s - Entries %s"
                                    % (a, epl, fasta_chunk), lock)
                                split_tsv_file_path = "%s_%s" % (
                                    self.tsv_file_path(), tsv_chunk)
                                params = {
                                    'input_iedb_files':
                                    split_iedb_output_files,
                                    'input_tsv_file': split_tsv_file_path,
                                    'key_file': split_fasta_key_file_path,
                                    'output_file': split_parsed_file_path,
                                }
                                # Include the sample name column only when
                                # explicitly requested.
                                if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
                                    params[
                                        'sample_name'] = self.sample_name
                                else:
                                    params['sample_name'] = None
                                parser = self.output_parser(params)
                                parser.execute()
                                status_message_with_lock(
                                    "Parsing IEDB Output for Allele %s and Epitope Length %s - Entries %s - Completed"
                                    % (a, epl, fasta_chunk), lock)
                                split_parsed_output_files.append(
                                    split_parsed_file_path)
    return split_parsed_output_files
def main(args_input=sys.argv[1:]):
    """Driver: validate arguments, split algorithms by MHC class, and run the
    MHC class I and/or class II pipelines (MHCIPipeline / MHCIIPipeline).

    Exits on an invalid allele, a sample name containing '.', an odd fasta
    size, or a malformed downstream sequence length.
    """
    parser = define_parser()
    args = parser.parse_args(args_input)
    PredictionClass.check_alleles_valid(args.allele)
    # '.' is used as a field separator in generated file names.
    if "." in args.sample_name:
        sys.exit("Sample name cannot contain '.'")
    if args.fasta_size % 2 != 0:
        sys.exit("The fasta size needs to be an even number")
    # 'full' means unlimited downstream sequence (None sentinel).
    if args.downstream_sequence_length == 'full':
        downstream_sequence_length = None
    elif args.downstream_sequence_length.isdigit():
        # NOTE(review): kept as a string here; a sibling driver converts it
        # with int() — confirm which form the pipelines expect.
        downstream_sequence_length = args.downstream_sequence_length
    else:
        sys.exit(
            "The downstream sequence length needs to be a positive integer or 'full'"
        )
    base_output_dir = os.path.abspath(args.output_dir)
    # Partition the requested algorithms into MHC class I vs class II.
    class_i_prediction_algorithms = []
    class_ii_prediction_algorithms = []
    for prediction_algorithm in args.prediction_algorithms:
        prediction_class = globals()[prediction_algorithm]
        prediction_class_object = prediction_class()
        if isinstance(prediction_class_object, MHCI):
            class_i_prediction_algorithms.append(prediction_algorithm)
        elif isinstance(prediction_class_object, MHCII):
            class_ii_prediction_algorithms.append(prediction_algorithm)
    # Arguments common to both pipelines; class-specific keys are added
    # to copies below.
    shared_arguments = {
        'input_file': args.input_file,
        'sample_name': args.sample_name,
        'alleles': args.allele,
        'top_result_per_mutation': args.top_result_per_mutation,
        'top_score_metric': args.top_score_metric,
        'binding_threshold': args.binding_threshold,
        'minimum_fold_change': args.minimum_fold_change,
        'net_chop_method': args.net_chop_method,
        'net_chop_threshold': args.net_chop_threshold,
        'normal_cov': args.normal_cov,
        'normal_vaf': args.normal_vaf,
        'tdna_cov': args.tdna_cov,
        'tdna_vaf': args.tdna_vaf,
        'trna_cov': args.trna_cov,
        'trna_vaf': args.trna_vaf,
        'expn_val': args.expn_val,
        'fasta_size': args.fasta_size,
        'downstream_sequence_length': downstream_sequence_length,
        'keep_tmp_files': args.keep_tmp_files,
    }
    additional_input_files = parse_additional_input_file_list(
        args.additional_input_file_list)
    shared_arguments.update(additional_input_files)
    if len(class_i_prediction_algorithms) > 0:
        if args.epitope_length is None:
            sys.exit(
                "Epitope length is required for class I binding predictions")
        print("Executing MHC Class I predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_I')
        os.makedirs(output_dir, exist_ok=True)
        class_i_arguments = shared_arguments.copy()
        class_i_arguments[
            'peptide_sequence_length'] = args.peptide_sequence_length
        class_i_arguments['epitope_lengths'] = args.epitope_length
        class_i_arguments[
            'prediction_algorithms'] = class_i_prediction_algorithms
        class_i_arguments['output_dir'] = output_dir
        class_i_arguments['netmhc_stab'] = args.netmhc_stab
        pipeline = MHCIPipeline(**class_i_arguments)
        pipeline.execute()
    if len(class_ii_prediction_algorithms) > 0:
        print("Executing MHC Class II predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_II')
        os.makedirs(output_dir, exist_ok=True)
        class_ii_arguments = shared_arguments.copy()
        class_ii_arguments[
            'prediction_algorithms'] = class_ii_prediction_algorithms
        class_ii_arguments['output_dir'] = output_dir
        # NetMHCstab is a class I-only tool, so it is forced off here.
        class_ii_arguments['netmhc_stab'] = False
        pipeline = MHCIIPipeline(**class_ii_arguments)
        pipeline.execute()
def print_valid_alleles(self):
    """Print the sorted valid allele names, one per line.

    When no prediction algorithm is configured, every known allele is
    listed; otherwise only the alleles supported by that algorithm.
    """
    if self.prediction_algorithm is None:
        allele_names = PredictionClass.all_valid_allele_names()
    else:
        algorithm_class = globals()[self.prediction_algorithm]
        allele_names = algorithm_class().valid_allele_names()
    print("\n".join(sorted(allele_names)))
def main(args_input=sys.argv[1:]):
    """Entry point: validate CLI arguments, partition alleles and algorithms
    into MHC Class I / Class II groups, and run the applicable pipelines.

    Exits with an error message on invalid sample name, odd fasta size,
    excessive IEDB retries, or a malformed downstream sequence length.
    """
    parser = define_parser()
    args = parser.parse_args(args_input)
    PredictionClass.check_alleles_valid(args.allele)
    # '.' is used as a field separator in generated file names, so it is
    # disallowed in the sample name.
    if "." in args.sample_name:
        sys.exit("Sample name cannot contain '.'")
    if args.fasta_size % 2 != 0:
        sys.exit("The fasta size needs to be an even number")
    if args.iedb_retries > 100:
        sys.exit("The number of IEDB retries must be less than or equal to 100")
    if args.downstream_sequence_length == 'full':
        downstream_sequence_length = None
    elif args.downstream_sequence_length.isdigit():
        # BUG FIX: previously the raw string was kept; convert to int so the
        # pipelines receive a numeric length (matches the other entry-point
        # variant in this file, which already does int(...)).
        downstream_sequence_length = int(args.downstream_sequence_length)
    else:
        sys.exit("The downstream sequence length needs to be a positive integer or 'full'")
    base_output_dir = os.path.abspath(args.output_dir)
    # Split the requested algorithms into Class I vs Class II by inspecting
    # the prediction class hierarchy.
    class_i_prediction_algorithms = []
    class_ii_prediction_algorithms = []
    for prediction_algorithm in sorted(args.prediction_algorithms):
        prediction_class = globals()[prediction_algorithm]
        prediction_class_object = prediction_class()
        if isinstance(prediction_class_object, MHCI):
            class_i_prediction_algorithms.append(prediction_algorithm)
        elif isinstance(prediction_class_object, MHCII):
            class_ii_prediction_algorithms.append(prediction_algorithm)
    # An allele may be valid for Class I, Class II, or both.
    class_i_alleles = []
    class_ii_alleles = []
    for allele in sorted(set(args.allele)):
        if allele in MHCI.all_valid_allele_names():
            class_i_alleles.append(allele)
        if allele in MHCII.all_valid_allele_names():
            class_ii_alleles.append(allele)
    # Arguments common to both pipeline classes.
    shared_arguments = {
        'input_file': args.input_file,
        'sample_name': args.sample_name,
        'top_result_per_mutation': args.top_result_per_mutation,
        'top_score_metric': args.top_score_metric,
        'binding_threshold': args.binding_threshold,
        'minimum_fold_change': args.minimum_fold_change,
        'net_chop_method': args.net_chop_method,
        'net_chop_threshold': args.net_chop_threshold,
        'normal_cov': args.normal_cov,
        'normal_vaf': args.normal_vaf,
        'tdna_cov': args.tdna_cov,
        'tdna_vaf': args.tdna_vaf,
        'trna_cov': args.trna_cov,
        'trna_vaf': args.trna_vaf,
        'expn_val': args.expn_val,
        'fasta_size': args.fasta_size,
        'iedb_retries': args.iedb_retries,
        'downstream_sequence_length': downstream_sequence_length,
        'keep_tmp_files': args.keep_tmp_files,
    }
    additional_input_files = parse_additional_input_file_list(args.additional_input_file_list)
    shared_arguments.update(additional_input_files)
    if len(class_i_prediction_algorithms) > 0 and len(class_i_alleles) > 0:
        # Epitope length only applies to Class I predictions.
        if args.epitope_length is None:
            sys.exit("Epitope length is required for class I binding predictions")
        if args.iedb_install_directory:
            iedb_mhc_i_executable = os.path.join(args.iedb_install_directory, 'mhc_i', 'src', 'predict_binding.py')
            if not os.path.exists(iedb_mhc_i_executable):
                sys.exit("IEDB MHC I executable path doesn't exist %s" % iedb_mhc_i_executable)
        else:
            iedb_mhc_i_executable = None
        print("Executing MHC Class I predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_I')
        os.makedirs(output_dir, exist_ok=True)
        class_i_arguments = shared_arguments.copy()
        class_i_arguments['alleles'] = class_i_alleles
        class_i_arguments['peptide_sequence_length'] = args.peptide_sequence_length
        class_i_arguments['iedb_executable'] = iedb_mhc_i_executable
        class_i_arguments['epitope_lengths'] = args.epitope_length
        class_i_arguments['prediction_algorithms'] = class_i_prediction_algorithms
        class_i_arguments['output_dir'] = output_dir
        class_i_arguments['netmhc_stab'] = args.netmhc_stab
        pipeline = MHCIPipeline(**class_i_arguments)
        pipeline.execute()
    if len(class_ii_prediction_algorithms) > 0 and len(class_ii_alleles) > 0:
        if args.iedb_install_directory:
            iedb_mhc_ii_executable = os.path.join(args.iedb_install_directory, 'mhc_ii', 'mhc_II_binding.py')
            if not os.path.exists(iedb_mhc_ii_executable):
                sys.exit("IEDB MHC II executable path doesn't exist %s" % iedb_mhc_ii_executable)
        else:
            iedb_mhc_ii_executable = None
        print("Executing MHC Class II predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_II')
        os.makedirs(output_dir, exist_ok=True)
        class_ii_arguments = shared_arguments.copy()
        class_ii_arguments['alleles'] = class_ii_alleles
        class_ii_arguments['prediction_algorithms'] = class_ii_prediction_algorithms
        class_ii_arguments['iedb_executable'] = iedb_mhc_ii_executable
        class_ii_arguments['output_dir'] = output_dir
        # NetMHCStabPan is a Class I-only tool, so it is disabled here.
        class_ii_arguments['netmhc_stab'] = False
        pipeline = MHCIIPipeline(**class_ii_arguments)
        pipeline.execute()
def main(args_input=sys.argv[1:]):
    """Entry point: validate CLI arguments (VCF or bedpe input), partition
    alleles and algorithms into MHC Class I / Class II groups, and run the
    applicable pipelines.

    Exits with an error message on an unrecognized input file extension,
    invalid sample name, odd fasta size, excessive IEDB retries, or a
    malformed downstream sequence length.
    """
    parser = define_parser()
    args = parser.parse_args(args_input)
    # Input type is inferred from the file extension.
    if args.input_file.endswith('.vcf'):
        input_file_type = 'vcf'
    elif args.input_file.endswith('.bedpe'):
        input_file_type = 'bedpe'
    else:
        # BUG FIX: the message previously referenced the undefined name
        # `input_file`, which raised a NameError instead of exiting cleanly.
        sys.exit("Unknown input file type for file (%s). Input file must be either a VCF (.vcf) or a bedpe (.bedpe) file." % args.input_file)
    # '.' is used as a field separator in generated file names, so it is
    # disallowed in the sample name.
    if "." in args.sample_name:
        sys.exit("Sample name cannot contain '.'")
    if args.fasta_size % 2 != 0:
        sys.exit("The fasta size needs to be an even number")
    if args.iedb_retries > 100:
        sys.exit("The number of IEDB retries must be less than or equal to 100")
    if args.downstream_sequence_length == 'full':
        downstream_sequence_length = None
    elif args.downstream_sequence_length.isdigit():
        downstream_sequence_length = int(args.downstream_sequence_length)
    else:
        sys.exit("The downstream sequence length needs to be a positive integer or 'full'")
    base_output_dir = os.path.abspath(args.output_dir)
    # Split the requested algorithms into Class I vs Class II by inspecting
    # the prediction class hierarchy.
    class_i_prediction_algorithms = []
    class_ii_prediction_algorithms = []
    for prediction_algorithm in sorted(args.prediction_algorithms):
        prediction_class = globals()[prediction_algorithm]
        prediction_class_object = prediction_class()
        if isinstance(prediction_class_object, MHCI):
            class_i_prediction_algorithms.append(prediction_algorithm)
        elif isinstance(prediction_class_object, MHCII):
            class_ii_prediction_algorithms.append(prediction_algorithm)
    # An allele may be valid for Class I, Class II, or both; warn and skip
    # alleles valid for neither.
    class_i_alleles = []
    class_ii_alleles = []
    for allele in sorted(set(args.allele)):
        valid = False
        if allele in MHCI.all_valid_allele_names():
            class_i_alleles.append(allele)
            valid = True
        if allele in MHCII.all_valid_allele_names():
            class_ii_alleles.append(allele)
            valid = True
        if not valid:
            print("Allele %s not valid. Skipping." % allele)
    # Arguments common to both pipeline classes.
    shared_arguments = {
        'input_file': args.input_file,
        'input_file_type': input_file_type,
        'sample_name': args.sample_name,
        'top_result_per_mutation': args.top_result_per_mutation,
        'top_score_metric': args.top_score_metric,
        'binding_threshold': args.binding_threshold,
        'minimum_fold_change': args.minimum_fold_change,
        'net_chop_method': args.net_chop_method,
        'net_chop_threshold': args.net_chop_threshold,
        'normal_cov': args.normal_cov,
        'normal_vaf': args.normal_vaf,
        'tdna_cov': args.tdna_cov,
        'tdna_vaf': args.tdna_vaf,
        'trna_cov': args.trna_cov,
        'trna_vaf': args.trna_vaf,
        'expn_val': args.expn_val,
        'additional_report_columns': args.additional_report_columns,
        'fasta_size': args.fasta_size,
        'iedb_retries': args.iedb_retries,
        'downstream_sequence_length': downstream_sequence_length,
        'keep_tmp_files': args.keep_tmp_files,
    }
    additional_input_files = parse_additional_input_file_list(args.additional_input_file_list)
    shared_arguments.update(additional_input_files)
    if len(class_i_prediction_algorithms) > 0 and len(class_i_alleles) > 0:
        # Epitope length only applies to Class I predictions.
        if args.epitope_length is None:
            sys.exit("Epitope length is required for class I binding predictions")
        if args.iedb_install_directory:
            iedb_mhc_i_executable = os.path.join(args.iedb_install_directory, 'mhc_i', 'src', 'predict_binding.py')
            if not os.path.exists(iedb_mhc_i_executable):
                sys.exit("IEDB MHC I executable path doesn't exist %s" % iedb_mhc_i_executable)
        else:
            iedb_mhc_i_executable = None
        print("Executing MHC Class I predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_I')
        os.makedirs(output_dir, exist_ok=True)
        class_i_arguments = shared_arguments.copy()
        class_i_arguments['alleles'] = class_i_alleles
        class_i_arguments['peptide_sequence_length'] = args.peptide_sequence_length
        class_i_arguments['iedb_executable'] = iedb_mhc_i_executable
        class_i_arguments['epitope_lengths'] = args.epitope_length
        class_i_arguments['prediction_algorithms'] = class_i_prediction_algorithms
        class_i_arguments['output_dir'] = output_dir
        class_i_arguments['netmhc_stab'] = args.netmhc_stab
        pipeline = MHCIPipeline(**class_i_arguments)
        pipeline.execute()
    if len(class_ii_prediction_algorithms) > 0 and len(class_ii_alleles) > 0:
        if args.iedb_install_directory:
            iedb_mhc_ii_executable = os.path.join(args.iedb_install_directory, 'mhc_ii', 'mhc_II_binding.py')
            if not os.path.exists(iedb_mhc_ii_executable):
                sys.exit("IEDB MHC II executable path doesn't exist %s" % iedb_mhc_ii_executable)
        else:
            iedb_mhc_ii_executable = None
        print("Executing MHC Class II predictions")
        output_dir = os.path.join(base_output_dir, 'MHC_Class_II')
        os.makedirs(output_dir, exist_ok=True)
        class_ii_arguments = shared_arguments.copy()
        class_ii_arguments['alleles'] = class_ii_alleles
        class_ii_arguments['prediction_algorithms'] = class_ii_prediction_algorithms
        class_ii_arguments['iedb_executable'] = iedb_mhc_ii_executable
        class_ii_arguments['output_dir'] = output_dir
        # NetMHCStabPan is a Class I-only tool, so it is disabled here.
        class_ii_arguments['netmhc_stab'] = False
        pipeline = MHCIIPipeline(**class_ii_arguments)
        pipeline.execute()
def call_iedb_and_parse_outputs(self, chunks):
    """Run IEDB binding predictions for each chunk/allele/epitope-length
    combination, then parse the raw IEDB outputs into per-combination
    "parsed" TSV files, returning the list of parsed file paths.

    :param chunks: iterable of (split_start, split_end) entry ranges
                   referring to chunks of the split input TSV.
    :return: list of paths to the parsed output TSV files produced (or
             found already existing) in self.tmp_dir.
    """
    split_parsed_output_files = []
    for (split_start, split_end) in chunks:
        tsv_chunk = "%d-%d" % (split_start, split_end)
        # FASTA entries come in pairs per TSV row (presumably wild-type +
        # mutant sequences — this matches the even-fasta-size requirement
        # enforced by the entry points), hence the *2 index mapping.
        fasta_chunk = "%d-%d" % (split_start * 2 - 1, split_end * 2)
        for a in self.alleles:
            for epl in self.epitope_lengths:
                split_fasta_file_path = "%s_%s" % (self.split_fasta_basename(), fasta_chunk)
                split_iedb_output_files = []
                status_message("Processing entries for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                for method in self.prediction_algorithms:
                    # Resolve the algorithm name to its prediction class and
                    # the method identifier IEDB expects.
                    prediction_class = globals()[method]
                    prediction = prediction_class()
                    iedb_method = prediction.iedb_prediction_method
                    # Skip allele/length combinations this method can't handle.
                    valid_alleles = prediction.valid_allele_names()
                    if a not in valid_alleles:
                        status_message("Allele %s not valid for Method %s. Skipping." % (a, method))
                        continue
                    valid_lengths = prediction.valid_lengths_for_allele(a)
                    if epl not in valid_lengths:
                        status_message("Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (epl, method, a))
                        continue
                    split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(epl), "tsv_%s" % fasta_chunk]))
                    # Resume support: reuse an existing IEDB output file
                    # instead of re-running the (slow, possibly remote) call.
                    if os.path.exists(split_iedb_out):
                        status_message("IEDB file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, epl, method, fasta_chunk))
                        split_iedb_output_files.append(split_iedb_out)
                        continue
                    status_message("Running IEDB on Allele %s and Epitope Length %s with Method %s - Entries %s" % (a, epl, method, fasta_chunk))
                    lib.call_iedb.main([
                        split_fasta_file_path,
                        split_iedb_out,
                        iedb_method,
                        a,
                        '-l', str(epl),
                        '-r', str(self.iedb_retries),
                        '-e', self.iedb_executable,
                    ])
                    status_message("Completed")
                    split_iedb_output_files.append(split_iedb_out)
                split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(epl), "parsed", "tsv_%s" % fasta_chunk]))
                # Resume support: an existing parsed file short-circuits the
                # parsing step for this combination.
                if os.path.exists(split_parsed_file_path):
                    status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, epl, fasta_chunk))
                    split_parsed_output_files.append(split_parsed_file_path)
                    continue
                split_fasta_key_file_path = split_fasta_file_path + '.key'
                # Only parse when at least one IEDB output exists for this
                # allele/length; otherwise there is nothing to parse.
                if len(split_iedb_output_files) > 0:
                    status_message("Parsing IEDB Output for Allele %s and Epitope Length %s - Entries %s" % (a, epl, fasta_chunk))
                    split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
                    params = [
                        *split_iedb_output_files,
                        split_tsv_file_path,
                        split_fasta_key_file_path,
                        split_parsed_file_path,
                        '-m', self.top_score_metric,
                    ]
                    if self.top_result_per_mutation == True:
                        params.append('-t')
                    lib.parse_output.main(params)
                    status_message("Completed")
                    split_parsed_output_files.append(split_parsed_file_path)
    return split_parsed_output_files