Example #1
0
    def run(self, filtered_reads_file, debug=False):
        """
        Run BFC over a (filtered) reads file.

        Returns a dict with the keys:
        command - the command that was run
        corrected_reads - the corrected fastq file
        version_string - the tool version string
        """
        # Invocation shape:
        # bfc <flag params> filtered_fastq_file
        bfc_dir = os.path.join(self.output_dir, "bfc")
        mkdir(bfc_dir)
        corrected_path = os.path.join(bfc_dir, "bfc_output.fastq")

        params = ["-1", "-k", "21", "-t", "10"]
        if not debug:
            # the extra sizing flag is skipped in debug mode
            params += ["-s", "10g"]
        params += [filtered_reads_file, ">", corrected_path]

        (exit_code, command) = super(BFCRunner, self).run(*params)
        if exit_code != 0:
            raise RuntimeError("An error occurred while running BFC!")

        return {
            "command": command,
            "corrected_reads": corrected_path,
            "version_string": self.version_string()
        }
Example #2
0
    def run(self, reads_file, contigs_file):
        """
        Runs BBMap to map the given reads file (FASTQ) to the contigs file (FASTA).
        Returns the paths to the SAM file, coverage stats, and overall BBMap stats as
        map_file, coverage_file, and stats_file, respectively.
        """
        out_dir = os.path.join(self.output_dir, "readMappingPairs")
        mkdir(out_dir)

        # Output destinations: mapped reads, per-contig coverage, run stats.
        sam_path = os.path.join(out_dir, "pairedMapped.sam.gz")
        covstats_path = os.path.join(out_dir, "covstats.txt")
        stats_path = os.path.join(out_dir, "bbmap_stats.txt")

        params = ["-Xmx100g", "nodisk=true", "interleaved=true", "ambiguous=random"]
        params.append("in={}".format(reads_file))
        params.append("ref={}".format(contigs_file))
        params.append("out={}".format(sam_path))
        params.append("covstats={}".format(covstats_path))
        params += ["2>", stats_path]

        (exit_code, command) = super(BBMapRunner, self).run(*params)
        if exit_code != 0:
            raise RuntimeError("An error occurred while running BBMap!")

        return {
            "map_file": sam_path,
            "coverage_file": covstats_path,
            "stats_file": stats_path,
            "command": command,
            "version_string": self.version_string()
        }
Example #3
0
 def run_skip(self, reads_file):
     """
     Doesn't run RQCFilter, but a dummy skip version. It returns the same
     result structure, so it doesn't derail the other pipeline steps. However, the
     "filtered_fastq_file" is the unchanged fastq file, other than gzipping it.
     run_log is just an empty (but existing!) file.
     """
     print("NOT running RQCFilter, just putting together some results.")
     # Dummy output dir, timestamped so repeated runs don't collide.
     dummy_dir = os.path.join(
         self.scratch_dir,
         "dummy_rqcfilter_output_{}".format(int(time() * 1000)))
     mkdir(dummy_dir)
     # The "log" is just an empty file created by open/close.
     dummy_log = os.path.join(dummy_dir, "dummy_rqcfilter_log.txt")
     open(dummy_log, 'w').close()
     # gzip the reads and relocate them into the dummy output dir (probably
     # don't need to move them, but let's be consistent)
     dfu = DataFileUtil(self.callback_url)
     packed = dfu.pack_file({
         "file_path": reads_file,
         "pack": "gzip"
     })
     gzipped_reads = packed["file_path"]
     unfiltered_path = os.path.join(dummy_dir, os.path.basename(gzipped_reads))
     os.rename(gzipped_reads, unfiltered_path)
     return {
         "output_directory": dummy_dir,
         "filtered_fastq_file": unfiltered_path,
         "run_log": dummy_log,
         "command":
         "BBTools.run_RQCFilter_local -- skipped. No command run.",
         "version_string": "KBase BBTools module"
     }
Example #4
0
    def test_mkdir_fail(self):
        # A missing (None) path must raise a ValueError with a clear message.
        with self.assertRaises(ValueError) as cm:
            mkdir(None)
        self.assertIn("A path is required", str(cm.exception))

        # A path that already exists should be a silent no-op (no crash).
        self.assertTrue(os.path.exists("data"))
        mkdir("data")
Example #5
0
 def __init__(self, callback_url, scratch_dir):
     """
     Initialize a few things. Starting points, paths, etc.
     """
     self.scratch_dir = scratch_dir
     self.callback_url = callback_url
     # Millisecond timestamp keeps output dirs from repeated runs distinct.
     self.timestamp = int(time.time() * 1000)
     self.output_dir = os.path.join(
         self.scratch_dir, "jgi_mga_output_{}".format(self.timestamp))
     mkdir(self.output_dir)
     self.file_util = FileUtil(callback_url)
    def run(self, scaffold_file):
        """
        Run the stats tool twice over scaffold_file: a first pass producing a
        tab-delimited report (format=6) and a second pass producing the plain
        text report. Returns a dict with paths to both reports, the shared
        stderr file, the combined command, and the tool version string.
        """
        stats_dir = os.path.join(self.output_dir, "assembly_stats")
        mkdir(stats_dir)
        tsv_path = os.path.join(stats_dir,
                                "assembly.scaffolds.fasta.stats.tsv")
        txt_path = os.path.join(stats_dir,
                                "assembly.scaffolds.fasta.stats.txt")
        err_path = os.path.join(stats_dir, "stderr.out")

        # First pass: tab-delimited (format=6) output.
        first_params = [
            "format=6", "in={}".format(scaffold_file), "1>", tsv_path,
            "2>", err_path
        ]
        (exit_code, first_command) = super(StatsRunner,
                                           self).run(*first_params)
        if exit_code != 0:
            raise RuntimeError(
                "Unable to run first pass at stats to generate tab-delimited files!"
            )

        # Second pass: plain text output; note stderr is appended (2>>).
        second_params = [
            "in={}".format(scaffold_file), "1>", txt_path, "2>>",
            err_path
        ]
        (exit_code, second_command) = super(StatsRunner,
                                            self).run(*second_params)
        if exit_code != 0:
            raise RuntimeError(
                "Unable to run second pass at stats to generate standard text files!"
            )

        return {
            "stats_tsv": tsv_path,
            "stats_txt": txt_path,
            "stats_err": err_path,
            "version_string": self.version_string(),
            "command": first_command + " && " + second_command
        }
Example #7
0
 def test_mkdir_ok(self):
     # mkdir should create the full nested path and all parents.
     deep_path = os.path.join("a_dir", "another_dir", "a_deep_dir")
     self.assertFalse(os.path.exists(deep_path))
     mkdir(deep_path)
     self.assertTrue(os.path.exists(deep_path))
Example #8
0
    def run(self, input_file, output_file_name):
        """
        Runs readlength.sh on input_file to generate a file named output_file under the output_dir.
        It then skims that file for several values and returns them as a dictionary. The keys to this return dict are:
        count - the number of reads
        bases - the total number of bases
        max - the length of the longest read
        min - the length of the shortest read
        avg - the average read length
        median - the median read length
        mode - the mode of the mean lengths
        std_dev - the standard deviation of read lengths
        output_file - the output file from readlength, containing a histogram of reads info
        command - the command that was run
        version_string - the tool version string

        This also calculates the histogram, but it's left out for now. (Unless it's needed later)
        If the output file exists, it will be overwritten.

        :raises ValueError: if input_file does not exist
        :raises RuntimeError: if readlength fails or its output file is missing
        """
        if not os.path.exists(input_file):
            raise ValueError(
                "The input file '{}' can't be found!".format(input_file))
        mkdir(os.path.join(self.output_dir, "readlength"))
        output_file_path = os.path.join(self.output_dir, "readlength",
                                        output_file_name)
        readlength_params = [
            "in={}".format(input_file), "1>|", output_file_path
        ]

        (exit_code, command) = super(ReadLengthRunner,
                                     self).run(*readlength_params)
        if exit_code != 0:
            raise RuntimeError("An error occurred while running readlength!")
        if not os.path.exists(output_file_path):
            raise RuntimeError(
                "The output file '{}' appears not to have been made!".format(
                    output_file_path))
        ret_value = dict()
        # This file will have some standard lines, all of which start with a '#'
        # like this:
        #    #Reads:	358
        #    #Bases:	35279
        #    #Max:	100
        #    #Min:	89
        #    #Avg:	98.5
        #    #Median:	100
        #    #Mode:	100
        #    #Std_Dev:	4.9
        #    #Read Length Histogram: (a table follows that we're not using)
        # These get parsed based on their name. #Reads is an int, so parse it that way. #Avg is a float, etc.
        # The parsed numerical values get returned in the output dictionary.
        with open(output_file_path, "r") as read_len_file:
            line_mapping = {
                "#Reads:": ("count", int),
                "#Bases:": ("bases", int),
                "#Max:": ("max", int),
                "#Min:": ("min", int),
                "#Avg:": ("avg", float),
                "#Median:": ("median", int),
                "#Mode:": ("mode", int),
                "#Std_Dev:": ("std_dev", float),
            }
            for line in read_len_file:
                chopped = line.split()
                # Guard against blank lines: split() returns [] for them, and
                # chopped[0] would raise an IndexError.
                if not chopped:
                    continue
                if chopped[0] in line_mapping:
                    key, map_fn = line_mapping[chopped[0]]
                    ret_value[key] = map_fn(chopped[1])
        ret_value.update({
            "output_file": output_file_path,
            "command": command,
            "version_string": self.version_string()
        })
        return ret_value
Example #9
0
    def run(self, input_file, reads_info, options):
        """
        Runs spades, returns the generated output directory name. It's full of standard files.
        Candidate kmer sizes are 33,55,77,99,127; any kmer larger than the
        average read length (reads_info["avg"]) is dropped before running.
        :param input_file: string or path to the input paired-end reads file
        :param reads_info: dict
            - info about the reads from readlength.py. This uses the output_file and avg keys.
        :param options: dict
            - "max_memory" - max allowed memory in GB (default 2000)
        """
        assembly_dir = os.path.join(self.output_dir, "spades", "spades3")
        mkdir(assembly_dir)

        # Keep only the candidate kmers that fit the average read length.
        avg_read_len = reads_info["avg"]
        kmers = [k for k in (33, 55, 77, 99, 127) if k <= avg_read_len]

        memory_limit = str(options.get("max_memory", 2000))

        spades_params = [
            "--only-assembler", "-k", ",".join(map(str, kmers)), "--meta",
            "-t", "32", "-m", memory_limit, "-o", assembly_dir, "--12",
            input_file
        ]

        # Echo the read-length report into our log before running.
        print("SPAdes input reads info:\n{}\n".format("=" * 24))
        file_to_log(reads_info["output_file"])
        print("{}\nEnd SPAdes input reads info\n".format("=" * 27))

        (exit_code, command) = super(SpadesRunner, self).run(*spades_params)

        # Relay the SPAdes logs to stdout before checking the exit code, so
        # failed runs still get their transcripts printed.
        print("Done running SPAdes")
        print("See log transcripts below for details")
        for log_name in ["warnings.log", "params.txt", "spades.log"]:
            log_path = os.path.join(assembly_dir, log_name)
            if os.path.exists(log_path):
                print("SPAdes log file {}:\n{}\n".format(
                    log_name, "=" * (17 + len(log_name))))
                file_to_log(log_path)
                print("{}\nEnd SPAdes log file {}\n".format(
                    "=" * (20 + len(log_name)), log_name))

        if exit_code != 0:
            raise RuntimeError(
                "Errors occurred while running spades. Check the logs for details. Unable to continue pipeline."
            )

        results = {
            "command": command,
            "version_string": self.version_string(),
            "output_dir": assembly_dir,
            "run_log": os.path.join(assembly_dir, "spades.log"),
            "params_log": os.path.join(assembly_dir, "params.txt")
        }
        # These outputs are optional - only report the ones SPAdes produced.
        optional_outputs = [
            ("warnings_log", "warnings.log"),
            ("scaffolds_file", "scaffolds.fasta"),
            ("contigs_file", "contigs.fasta"),
        ]
        for result_key, file_name in optional_outputs:
            candidate = os.path.join(assembly_dir, file_name)
            if os.path.exists(candidate):
                results[result_key] = candidate
        return results