def predict(self, df, mutation_window_size=None):
    """
    Given a dataframe of mutated amino acid sequences, run each sequence
    through NetMHCpan.

    If mutation_window_size is not None then only make predictions for
    that number residues away from mutations.

    Expects the input DataFrame to have the following fields:
        - SourceSequence
        - MutationStart
        - MutationEnd
        - GeneInfo
        - Gene
        - GeneMutationInfo
        - PeptideMutationInfo
        - TranscriptId

    Returns a pandas DataFrame built from the records produced by
    parse_xls_file. Raises AssertionError if no records were parsed.
    """
    input_filename, peptide_entries = create_input_fasta_file(
        df,
        mutation_window_size=mutation_window_size)

    # Allele names are passed to the command with any "*" removed,
    # joined into a single comma-separated argument.
    alleles_str = ",".join(
        allele.replace("*", "") for allele in self.alleles)

    # delete=False: the file must outlive this handle's close so the
    # external command can write to it by name; CleanupFiles is handed
    # the file object below.
    output_file = tempfile.NamedTemporaryFile(
        "r+",
        prefix="netMHCpan_output",
        delete=False)

    command = [
        self.netmhc_command,
        "-xls",
        "-xlsfile", output_file.name,
        "-l", "9",
        "-f", input_filename,
        "-a", alleles_str,
    ]
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(" ".join(command))

    with CleanupFiles(
            filenames=[input_filename],
            files=[output_file]):
        run_command(command)
        results = parse_xls_file(
            output_file.read(),
            peptide_entries,
            mutation_window_size=mutation_window_size)

    # NOTE: asserts are stripped when Python runs with -O.
    assert len(results) > 0, "No epitopes from netMHCpan"
    return pd.DataFrame.from_records(results)
def predict(self, df, mutation_window_size=None):
    """
    Given a dataframe of mutated amino acid sequences, run each sequence
    through NetMHCpan.

    If mutation_window_size is not None then only make predictions for
    that number residues away from mutations.

    Expects the input DataFrame to have the following fields:
        - SourceSequence
        - MutationStart
        - MutationEnd
        - GeneInfo
        - Gene
        - GeneMutationInfo
        - PeptideMutationInfo
        - TranscriptId

    Returns the parsed prediction records as a pandas DataFrame
    (via pd.DataFrame.from_records). Raises AssertionError when
    parse_xls_file yields no records.
    """
    input_filename, peptide_entries = create_input_fasta_file(
        df, mutation_window_size=mutation_window_size)

    # The "*" is dropped from every allele name before the names are
    # comma-joined for the -a argument.
    alleles_str = ",".join(
        allele.replace("*", "") for allele in self.alleles)

    # Opened "r+" so the results can be read back through this handle
    # after the command has written to the file by name.
    output_file = tempfile.NamedTemporaryFile(
        "r+", prefix="netMHCpan_output", delete=False)

    command = [
        self.netmhc_command,
        "-xls",
        "-xlsfile", output_file.name,
        "-l", "9",
        "-f", input_filename,
        "-a", alleles_str,
    ]
    # Single parenthesised argument: valid and equivalent on both
    # Python 2 (print statement) and Python 3 (print function).
    print(" ".join(command))

    with CleanupFiles(filenames=[input_filename], files=[output_file]):
        run_command(command)
        results = parse_xls_file(
            output_file.read(),
            peptide_entries,
            mutation_window_size=mutation_window_size)

    # NOTE: this check disappears under `python -O`.
    assert len(results) > 0, "No epitopes from netMHCpan"
    return pd.DataFrame.from_records(results)
def predict(self, df, mutation_window_size=None):
    """
    Given a dataframe of mutated amino acid sequences, run each sequence
    through NetMHCcons.

    One command is prepared per allele in self.alleles, each with its own
    temporary directory (passed via -tdir) and its own output file.

    If mutation_window_size is not None then only make predictions for
    that number residues away from mutations.

    Expects the input DataFrame to have the following fields:
        - SourceSequence
        - MutationStart
        - MutationEnd
        - GeneInfo
        - Gene
        - GeneMutationInfo
        - PeptideMutationInfo
        - TranscriptId

    Returns a pandas DataFrame built from the rows parsed out of every
    per-allele output file. Raises AssertionError if no rows were parsed
    or if the number of distinct values in the Allele column differs
    from len(self.alleles).
    """
    input_filename, peptide_entries = create_input_fasta_file(
        df,
        mutation_window_size=mutation_window_size)

    # Keyed by the (still open) output file object; the value is the
    # command line associated with that file.
    commands = {}
    dirs = []
    for i, allele in enumerate(self.alleles):
        temp_dirname = tempfile.mkdtemp(prefix="tmp_netmhccons_")
        # Argument order matches the placeholders: directory, then allele.
        logging.info(
            "Created temporary directory %s for allele %s",
            temp_dirname,
            allele)
        dirs.append(temp_dirname)
        # delete=False so the file survives the close() below and can be
        # re-opened by name for reading.
        output_file = tempfile.NamedTemporaryFile(
            "w",
            prefix="netMHCcons_output_%d" % i,
            delete=False)
        commands[output_file] = [
            self.netmhc_command,
            "-length", "9",
            "-f", input_filename,
            "-a", allele,
            '-tdir', temp_dirname,
        ]

    # Cleanup either when finished or if an exception gets raised by
    # deleting the input and output files. The output file names come
    # from the keys of `commands`, the only place they are recorded.
    filenames_to_delete = [input_filename]
    filenames_to_delete.extend(f.name for f in commands)

    results = []
    with CleanupFiles(
            filenames=filenames_to_delete,
            directories=dirs):
        run_multiple_commands_redirect_stdout(
            commands, print_commands=True)
        for output_file in commands:
            # closing/opening looks insane
            # but I was getting empty files otherwise
            output_file.close()
            with open(output_file.name, 'r') as f:
                rows = parse_netmhc_stdout(
                    f.read(),
                    peptide_entries,
                    mutation_window_size=mutation_window_size)
            results.extend(rows)

    # NOTE: asserts are stripped when Python runs with -O.
    assert len(results) > 0, "No epitopes from netMHCcons"
    df = pd.DataFrame.from_records(results)
    unique_alleles = set(df.Allele)
    assert len(unique_alleles) == len(self.alleles), \
        "Expected %d alleles (%s) but got %d (%s)" % (
            len(self.alleles),
            self.alleles,
            len(unique_alleles),
            unique_alleles
        )
    return df