Ejemplo n.º 1
0
    def predict(self, df, mutation_window_size=None):
        """
        Given a dataframe of mutated amino acid sequences, run each sequence
        through NetMHCpan.
        If mutation_window_size is not None then only make predictions for that
        number residues away from mutations.

        Expects the input DataFrame to have the following fields:
            - SourceSequence
            - MutationStart
            - MutationEnd
            - GeneInfo
            - Gene
            - GeneMutationInfo
            - PeptideMutationInfo
            - TranscriptId

        Returns a pandas DataFrame built from the records produced by
        parse_xls_file.

        Raises AssertionError if no records were parsed from the output.
        """

        input_filename, peptide_entries = create_input_fasta_file(
            df,
            mutation_window_size=mutation_window_size
        )

        # All alleles go into a single invocation as one comma-separated
        # argument, with any "*" stripped from each allele name.
        alleles_str = ",".join(
            allele.replace("*", "") for allele in self.alleles)

        # delete=False: the file is only reserved here; the external command
        # writes to it by name (via -xlsfile) and we read it back afterwards.
        # NOTE(review): removal is presumably handled by CleanupFiles below.
        output_file = tempfile.NamedTemporaryFile(
            "r+",
            prefix="netMHCpan_output",
            delete=False)
        command = [
            self.netmhc_command,
            "-xls",
            "-xlsfile", output_file.name,
            "-l", "9",
            "-f", input_filename,
            "-a", alleles_str,
        ]
        # Single-argument call form prints the same text on Python 2 and 3,
        # unlike the Python 2-only print statement.
        print(" ".join(command))

        with CleanupFiles(
                filenames=[input_filename],
                files=[output_file]):
            run_command(command)
            results = parse_xls_file(
                output_file.read(),
                peptide_entries,
                mutation_window_size=mutation_window_size)

        assert len(results) > 0, "No epitopes from netMHCpan"
        return pd.DataFrame.from_records(results)
Ejemplo n.º 2
0
    def predict(self, df, mutation_window_size=None):
        """
        Given a dataframe of mutated amino acid sequences, run each sequence
        through NetMHCpan.
        If mutation_window_size is not None then only make predictions for that
        number residues away from mutations.

        Expects the input DataFrame to have the following fields:
            - SourceSequence
            - MutationStart
            - MutationEnd
            - GeneInfo
            - Gene
            - GeneMutationInfo
            - PeptideMutationInfo
            - TranscriptId

        Returns a pandas DataFrame built from the records produced by
        parse_xls_file.

        Raises AssertionError if no records were parsed from the output.
        """

        input_filename, peptide_entries = create_input_fasta_file(
            df, mutation_window_size=mutation_window_size)

        # One comma-separated allele argument, with "*" removed from each name.
        alleles_str = ",".join(
            allele.replace("*", "") for allele in self.alleles)

        # The temporary file is kept on close (delete=False) because the
        # external command fills it in by name through the -xlsfile option.
        # NOTE(review): CleanupFiles presumably disposes of it - confirm.
        output_file = tempfile.NamedTemporaryFile("r+",
                                                  prefix="netMHCpan_output",
                                                  delete=False)
        command = [
            self.netmhc_command, "-xls", "-xlsfile", output_file.name, "-l",
            "9", "-f", input_filename, "-a", alleles_str
        ]
        # print(...) with one argument behaves identically on Python 2 and 3;
        # the bare print statement is a syntax error on Python 3.
        print(" ".join(command))

        with CleanupFiles(filenames=[input_filename], files=[output_file]):
            run_command(command)
            results = parse_xls_file(output_file.read(),
                                     peptide_entries,
                                     mutation_window_size=mutation_window_size)

        assert len(results) > 0, "No epitopes from netMHCpan"
        return pd.DataFrame.from_records(results)
Ejemplo n.º 3
0
    def predict(self, df, mutation_window_size=None):
        """
        Given a dataframe of mutated amino acid sequences, run each sequence
        through NetMHCcons.
        If mutation_window_size is not None then only make predictions for that
        number residues away from mutations.

        One command is built per allele in self.alleles; each gets its own
        temporary working directory (passed via -tdir) and its own temporary
        output file.

        Expects the input DataFrame to have the following fields:
            - SourceSequence
            - MutationStart
            - MutationEnd
            - GeneInfo
            - Gene
            - GeneMutationInfo
            - PeptideMutationInfo
            - TranscriptId

        Returns a pandas DataFrame built from the rows parsed out of every
        allele's output.

        Raises AssertionError if no rows were parsed, or if the number of
        distinct values in the result's Allele column differs from
        len(self.alleles).
        """

        input_filename, peptide_entries = create_input_fasta_file(
            df,
            mutation_window_size=mutation_window_size
        )

        # Maps each open output file object to the command whose output it
        # is associated with.
        commands = {}
        dirs = []
        for i, allele in enumerate(self.alleles):
            temp_dirname = tempfile.mkdtemp(prefix="tmp_netmhccons_")
            # Argument order must follow the format string: directory first,
            # then allele.
            logging.info(
                "Created temporary directory %s for allele %s",
                temp_dirname,
                allele)
            dirs.append(temp_dirname)
            output_file = tempfile.NamedTemporaryFile(
                "w",
                prefix="netMHCcons_output_%d" % i,
                delete=False)
            command = [
                self.netmhc_command,
                "-length", "9",
                "-f", input_filename,
                "-a", allele,
                '-tdir', temp_dirname,
            ]
            commands[output_file] = command

        results = []

        # Cleanup either when finished or if an exception gets raised by
        # deleting the input and output files. The output files are the keys
        # of `commands`, so their names are collected from there.
        filenames_to_delete = [input_filename]
        filenames_to_delete.extend(f.name for f in commands)

        with CleanupFiles(
                filenames=filenames_to_delete,
                directories=dirs):
            run_multiple_commands_redirect_stdout(
                commands, print_commands=True)
            for output_file in commands:
                # closing/opening looks insane
                # but I was getting empty files otherwise
                output_file.close()
                with open(output_file.name, 'r') as f:
                    rows = parse_netmhc_stdout(
                        f.read(),
                        peptide_entries,
                        mutation_window_size=mutation_window_size)
                results.extend(rows)

        assert len(results) > 0, "No epitopes from netMHCcons"
        # Separate name so the input `df` parameter is not rebound.
        result_df = pd.DataFrame.from_records(results)
        unique_alleles = set(result_df.Allele)
        assert len(unique_alleles) == len(self.alleles), \
            "Expected %d alleles (%s) but got %d (%s)" % (
                len(self.alleles), self.alleles,
                len(unique_alleles), unique_alleles
            )
        return result_df