Example #1
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        stmnts = []

        prefix = IOTools.snip(outfile, ".vcf.gz")
        vcf_output = prefix + ".raw.vcf.gz"

        if not os.path.exists(vcf_output):
            stmnts.append("java "
                          "-Djava.io.tmpdir=%(tmpdir)s "
                          "-jar {self.path} "
                          "--analysis_type HaplotypeCaller "
                          "--input_file {bam} "
                          "--reference_sequence {reference_fasta} "
                          "--logging_level INFO "
                          "--log_to_file {outfile}.HaplotypeCaller.log "
                          "{params.haplotypecaller} "
                          "--out {vcf_output} "
                          ">& {prefix}.HaplotypeCaller.err".format(**locals()))
        else:
            E.warn("output file {vcf_output} already exists - "
                   "it will not be recomputed".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="5G")
Example #2
    def run(self, outfile, params):

        retvals = []
        prefix = IOTools.snip(outfile, ".bed.gz")
        vcffile = prefix + ".vcf.gz"
        if not os.path.exists(vcffile):
            retvals.extend(run_tool_delly.run(self, vcffile, params))

        statements = []

        statements.append("{self.path_bcftools} query "
                          "{params.bcftools_options} "
                          "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
                          "{vcffile} "
                          "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
                          "case \"DEL\": $5=0; break; "
                          "case \"DUP\": $5=3; break; "
                          "case \"INS\": next; break; "
                          "}}; print }}' "
                          "| bgzip "
                          "> {outfile}".format(**locals()))
        statements.append("tabix -f -p bed {outfile}".format(**locals()))

        statement = "; ".join(statements)
        retvals.append(P.run(statement))

        return retvals
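
The awk filter above keeps only records with a defined END coordinate and recodes the SV type into a numeric fifth column (DEL to 0, DUP to 3, insertions dropped). A minimal Python restatement of that logic, with the field order assumed from the bcftools format string:

SVTYPE_CODE = {"DEL": "0", "DUP": "3"}

def delly_to_bed(fields):
    """fields = [CHROM, POS, END, SVTYPE]; return a BED row or None to drop."""
    chrom, pos, end, svtype = fields
    if end == ".":           # no END coordinate: record is dropped
        return None
    if svtype == "INS":      # insertions are skipped ("next" in the awk)
        return None
    if svtype in SVTYPE_CODE:
        return fields + [SVTYPE_CODE[svtype]]   # DEL/DUP gain a code column
    return fields            # other SV types pass through unchanged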
Example #3
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)
        stmnts = []
        prefix = IOTools.snip(outfile, ".bam")

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type RealignerTargetCreator "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {outfile}.RealignerTargetCreator.log "
            "{params.realignertargetcreator} "
            "--out {outfile}.realign.intervals "
            ">& {outfile}.RealignerTargetCreator.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type IndelRealigner "
                      "--input_file {bam} "
                      "--reference_sequence {reference_fasta} "
                      "--targetIntervals {outfile}.realign.intervals "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.IndelRealigner.log "
                      "{params.indelrealigner} "
                      "--out @[email protected] "
                      ">& {outfile}.IndelRealigner.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type BaseRecalibrator "
                      "--input_file @[email protected] "
                      "--reference_sequence {reference_fasta} "
                      "--logging_level INFO "
                      "{params.baserecalibrator} "
                      "--log_to_file {outfile}.BaseRecalibrator.log "
                      "--out {outfile}.recal_data.table "
                      ">& {outfile}.BaseRecalibrator.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type PrintReads "
                      "--input_file @[email protected] "
                      "--reference_sequence {reference_fasta} "
                      "--BQSR {outfile}.recal_data.table "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.PrintReads.log "
                      "--out {outfile} "
                      ">& {outfile}.PrintReads.err".format(**locals()))

        stmnts.append("mv {prefix}.bai {outfile}.bam.bai")

        return self.run_statements(stmnts, job_memory="3G")
Example #4
def build_readgroup_string(outfile, params):

    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        try:
            readgroup_id = "-".join(re.search(
                params.readgroup_id_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_id_regex, outfile))

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        try:
            readgroup_sample = "-".join(re.search(
                params.readgroup_sample_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_sample_regex, outfile))

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)

    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
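
A worked example of the regex-driven readgroup extraction above; the filename and regular expression are hypothetical:

import re

# hypothetical outfile and regex, for illustration only
outfile = "mapping/sampleA-lane1.bam"
readgroup_id_regex = r"mapping/(\w+)-(\w+)\.bam"

readgroup_id = "-".join(re.search(readgroup_id_regex, outfile).groups())
# readgroup_id == "sampleA-lane1"; without a readgroup_sample_regex the
# sample defaults to the id, giving the readgroup string
# "@RG\tID:sampleA-lane1\tSM:sampleA-lane1"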
Example #5
    def run(self, infile, outfile, params):

        outfile_pass = IOTools.snip(outfile, ".tsv") + "-pass.fastq.gz"
        outfile_fail = IOTools.snip(outfile, ".tsv") + "-fail.fastq.gz"

        statement = ("zcat {infile} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "--output-stats-tsv={outfile} "
                     "- "
                     "| gzip "
                     "> {outfile_pass} "
                     "".format(**locals()))
        return P.run(statement)
Example #6
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")
        bams = resolve_argument(params.bam, ",")
        reference_fasta = get_reference(params)

        statements, gvcfs = [], []
        # TODO: sort out multi-threading
        for idx, bam in enumerate(bams.split(",")):
            output = prefix + "." + str(idx) + ".g.vcf"
            gvcfs.append(output)

            if os.path.exists(output):
                E.info("{} already exists - skipped".format(output))
                continue

            statements.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--emitRefConfidence GVCF "
                "--logging_level INFO "
                "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
                "{params.haplotypecaller} "
                "--out {output} "
                ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

        if statements:
            self.run_statements(statements, job_memory="4G")

        stmnts = []
        gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
        vcf_output = prefix + ".raw.vcf.gz"
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type GenotypeGVCFs "
                      "--reference_sequence {reference_fasta} "
                      "{gvcfs} "
                      "--logging_level INFO "
                      "--log_to_file {prefix}.GenotypeGVCFs.log "
                      "{params.genotypegvcfs} "
                      "--out {vcf_output} "
                      ">& {prefix}.GenotypeGVCFs".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="4G")
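
For illustration, with three input BAMs the scatter step above writes one gVCF per sample, and the merge step receives them as repeated --variant arguments (the prefix is hypothetical):

prefix = "results/joint"   # hypothetical
gvcfs = ["{}.{}.g.vcf".format(prefix, idx) for idx in range(3)]
print(" ".join("--variant {}".format(x) for x in gvcfs))
# --variant results/joint.0.g.vcf --variant results/joint.1.g.vcf
# --variant results/joint.2.g.vcf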
Example #7
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")

        bam = resolve_argument(params.bam, sep=",")
        reference_fasta = get_reference(params)

        bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])
        stmnts = []
        if not os.path.exists(prefix + ".annotated.vcf.gz"):
            tmpfile, pre_statement, post_statement = self.pre_process(
                params.vcf, outfile, params)

            stmnts.append(pre_statement)
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type VariantAnnotator "
                "--variant {tmpfile} "
                "{bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {prefix}.VariantAnnotator.log "
                "--annotation FisherStrand "
                "--annotation StrandOddsRatio "
                "--annotation ReadPosRankSumTest "
                "--annotation RMSMappingQuality "
                "--annotation MappingQualityRankSumTest "
                "{params.options} "
                "--out {prefix}.annotated.vcf.gz "
                ">& {prefix}.VariantAnnotator.err".format(**locals()))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

            stmnts.append(post_statement)
        else:
            E.warn("using pre-existing file {} with annotated variants".format(
                prefix + ".annotated.vcf.gz"))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

        return self.run_statements(stmnts, job_memory="3G")
Example #8
    def run(self, infiles, outfile, params):

        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            before = len(infiles)
            infiles = [
                x for x in infiles
                if os.path.getsize(x) >= params.min_size_bytes
            ]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if params.newer_than:
            before = len(infiles)
            cutoff = os.path.getmtime(params.newer_than)
            infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if len(infiles) == 0:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)

        outfile_fail = IOTools.snip(outfile,
                                    "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(**locals()))
        return P.run(statement)
Example #9
    def run(self, outfile, params):

        bam = resolve_argument(params.bam, sep=",")

        # "-T {outfile}.tmpdir -k "

        outfile = IOTools.snip(outfile, ".gz")
        # note that lumpy removes the temporary directory
        # after running, thus make sure it is unique and exists
        return P.run("{params.path} "
                     "-B {bam} "
                     "-o {outfile} "
                     "-T %(tmpdir)s_{self.__name__} "
                     "-v "
                     "{params.options} "
                     ">& {outfile}.log; "
                     "vcf-sort {outfile} "
                     "| bgzip > {outfile}.gz; "
                     "tabix -p vcf {outfile}.gz".format(**locals()))
Example #10
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)

        # rename index from x.bai to x.bam.bai
        outprefix = IOTools.snip(outfile, ".bam", ".cram")

        statement = ("java -Xmx8000m -jar {params.path} "
                     "MarkDuplicates "
                     "INPUT={bam} "
                     "TMP_DIR=%(tmpdir)s "
                     "CREATE_INDEX=TRUE "
                     "REFERENCE_SEQUENCE={params.reference_fasta} "
                     "METRICS_FILE={outfile}.metrics "
                     "{params.options} "
                     "OUTPUT={outfile} "
                     ">& {outfile}.log; "
                     "mv {outprefix}.bai {outfile}.bai".format(**locals()))

        # 12G is required for java overhead
        return P.run(statement, job_memory="12G")
Example #11
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        # ensure output directory exists.
        # This should be done on the pipeline level, but
        # ruffus currently seems not to allow this.
        outdir = os.path.dirname(outfile)
        if outdir and not os.path.exists(outdir):
            os.makedirs(outdir)

        output_files = [
            self.map_table_to_file(x, outfile) for x in self.tablenames
        ]

        kwargs = {
            'output_files': output_files,
            'input_files': infiles,
            'outdir': outdir
        }

        if self._runtime_regex:
            kwargs["alias"] = self.build_alias(str(infiles),
                                               regex=self._runtime_regex,
                                               alias=self._runtime_alias)

        self.save_meta(outfile, **kwargs)

        if self.ignore:
            found = False
            for i in self.ignore:
                if i in outdir:
                    found = True
                    break

            if found:
                E.warn("skipping task {} at runtime, an empty file is created".
                       format(outfile))
                IOTools.touch_file(outfile)
                return

        # if self.runtime_filter:
        # TODO: create empty outfile if regex matches
        #    pass

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    IOTools.snip(outfile) + ".info"))
            return

        # AH: duplicated from above?
        params = self.build_params(output_files=output_files)

        on_error_options = ["raise", "ignore"]
        on_error = params.get("on_error", "raise")
        if on_error not in on_error_options:
            raise ValueError("unknown option to 'on_error': '{}' "
                             "should be one of '{}'".format(
                                 on_error, ",".join(on_error_options)))

        if self.ignore_task(infiles, outfile, params):
            return

        # deal with placeholder files created by identity that are
        # located on a remote mount point
        def map_to_mount(fn):
            if os.path.exists(fn + ".mnt"):
                if not P.PARAMS["mount_point"]:
                    raise ValueError(
                        "encountered mounted file {}, but no mount point present"
                        .format(fn))
                with open(fn + ".mnt") as inf:
                    mount_path = inf.read()
                return os.path.join(P.PARAMS["mount_point"], mount_path)
            else:
                return fn

        # replace infiles with mount locations if necessary
        if isinstance(infiles, list):
            infiles = [map_to_mount(x) for x in infiles]
        else:
            infiles = map_to_mount(infiles)

        try:
            benchmark = self.run(infiles, outfile, as_namedtuple(params))
        except Exception as ex:
            on_error = params.get("on_error", "raise")
            if on_error == "raise":
                raise
            elif on_error == "ignore":
                E.warn(
                    "error occured during execution of {} but will be ignored:\n{}"
                    .format(self.__name__, ex))
                E.warn(
                    "an empty output file {} will be created.".format(outfile))
                IOTools.touch_file(outfile)
                benchmark = None

        if benchmark:
            self.save_benchmark(outfile, benchmark)
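
The .mnt placeholder convention resolved by map_to_mount can be exercised on its own; the function below is a simplified, standalone restatement of the closure above (error handling omitted, paths hypothetical):

import os

def map_to_mount(fn, mount_point):
    # a file x.mnt holds the path of x relative to the mount point
    if os.path.exists(fn + ".mnt"):
        with open(fn + ".mnt") as inf:
            return os.path.join(mount_point, inf.read())
    return fn

# if data.bam.mnt contains "project1/data.bam", then
# map_to_mount("data.bam", "/mnt/shared") returns
# "/mnt/shared/project1/data.bam"; otherwise "data.bam" is returned as-is.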
Example #12
    def run(self, outfile, params):

        local_options = []
        outfile = os.path.abspath(outfile)
        outdir = os.path.dirname(outfile)

        # assumption: the index directory is named like the reference
        # ("xyz.fa") without its .fa/.fasta suffix.
        reference_fasta = IOTools.snip(params.reference_fasta, ".fa", ".fasta")
        if not os.path.exists(reference_fasta):
            raise ValueError("input reference {} does not exist".format(reference_fasta))

        if "--jobs" in params.options or "-j" in params.options:
            job_threads = int(re.search(r"(--jobs|-j)\s*(\d+)",
                                        params.options).groups()[1])
        else:
            job_threads = 8

        if "--memory-limit" in params.options or "-m" in params.options:
            job_memory_gb = int(re.search(r"(--memory-limit|-m)\s*(\d+)",
                                          params.options).groups()[1])
        else:
            job_memory_gb = 60
            local_options.append("--memory-limit {}".format(job_memory_gb))

        if job_memory_gb < 60:
            E.warn("isaac-align likely to require at least 60Gb of memory, {}G requested".format(
                job_memory_gb))

        job_memory = "{}G".format(float(job_memory_gb) / job_threads)

        fastq_dir = os.path.join(outdir, "input_fastq")
        if not os.path.exists(fastq_dir):
            os.makedirs(fastq_dir)

        if len(params.fastq) == 2:
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read1.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[0]), os.path.join(fastq_dir, "lane1_read1.fastq.gz"))
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read2.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[1]), os.path.join(fastq_dir, "lane1_read2.fastq.gz"))
        else:
            raise NotImplementedError("expected 2 fastq files, received {}".format(len(params.fastq)))

        intermediate_bam = os.path.join(outdir,
                                        "Aligned",
                                        "Projects",
                                        "default",
                                        "default",
                                        "sorted.bam")

        # picard statement to set readgroup
        picard_statement = self.build_picard_statement(
            intermediate_bam,
            outfile,
            params)

        tmpdir = os.path.join(outdir, "TEMP")

        local_options = " ".join(local_options)
        # isaac generates output files in working directory, so do a cd and make
        # sure that absolute path names are used elsewhere.
        statement = (
            "cd {outdir}; "
            "{self.path} "
            "--reference-genome {reference_fasta}/sorted-reference.xml "
            "--base-calls {fastq_dir} "
            "--base-calls-format fastq-gz "
            "--temp-directory {tmpdir} "
            "--cleanup-intermediary 1 "
            "--bam-gzip-level {params.bam_gzip_level} "
            "{params.options} "
            "{local_options} "
            ">& {outfile}.isaac.log; "
            "{picard_statement}; "
            "rm -rf {tmpdir} "
            .format(**locals()))

        return P.run(statement)
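
The thread and memory extraction above can be checked in isolation; for example:

import re

options = "--jobs 16 --memory-limit 80"   # hypothetical option string
job_threads = int(re.search(r"(--jobs|-j)\s*(\d+)", options).groups()[1])
job_memory_gb = int(re.search(r"(--memory-limit|-m)\s*(\d+)", options).groups()[1])
# job_threads == 16, job_memory_gb == 80; the per-thread limit passed to
# the scheduler is 80 / 16 = 5.0, formatted as "5.0G"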
Example #13

module_dirs = [os.path.join(os.path.dirname(__file__))]
module_dirs.extend([
    x.strip() for x in os.environ.get("DAISY_TASKLIBRARY", "").split(",")
    if x.strip()
])

modules = []
for idx, root in enumerate(module_dirs):
    for module in glob.glob(os.path.join(root, "*.py")):
        if "flycheck" in module:
            continue
        if module.endswith("__init__.py"):
            continue
        module_name = IOTools.snip(os.path.basename(module))
        if idx == 0:
            modules.append(
                importlib.import_module(
                    "daisy.TaskLibrary.{}".format(module_name)))
        else:
            spec = importlib.util.spec_from_file_location(
                "daisy.UserLibrary.{}".format(module_name), module)
            foo = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(foo)
            modules.append(foo)

# TODO: use derivation instead of name prefix
map_tool_to_runner = dict()
map_metric_to_runner = dict()
map_collate_to_runner = dict()
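
A minimal sketch of the name-prefix registration the TODO alludes to; runner classes follow a run_tool_*/run_metric_*/run_collate_* naming scheme (compare run_tool_delly in Example #2), but the exact predicate used here is an assumption:

import inspect

for module in modules:
    for name, obj in inspect.getmembers(module, inspect.isclass):
        if name.startswith("run_tool_"):
            map_tool_to_runner[name] = obj
        elif name.startswith("run_metric_"):
            map_metric_to_runner[name] = obj
        elif name.startswith("run_collate_"):
            map_collate_to_runner[name] = obj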
Example #14
    def pre_process(self, infile, outfile, params):

        statements = []
        infile = IOTools.snip(infile, ".bam")
        tmpdir = P.get_parameters_as_namedtuple().tmpdir
        outprefix = os.path.basename(os.path.dirname(outfile))

        if params.copy_bam:
            statements.append("cp @[email protected] @[email protected]; "
                              "cp @[email protected] @[email protected]")

        if params.split_bam:
            statements.append("daisy bam2bam-split-reads "
                              "-i @[email protected] "
                              "-o - "
                              "{params.split_bam} "
                              "--log={outfile}_split_bam.log "
                              "2> {outfile}_split_bam.err "
                              "> @[email protected]; ".format(**locals()))

        if params.bam2bam:
            statements.append("daisy bam2bam "
                              "--stdin=@[email protected] "
                              "{params.bam2bam} "
                              "--log={outfile}_bam2bam.log "
                              "2> {outfile}_bam2bam.err "
                              "> @[email protected]; ".format(**locals()))

        if params.region:
            statements.append(
                "samtools view -b @[email protected] {} > @[email protected]".format(
                    params.region))

        if params.shift_quality:
            statements.append("samtools view -h @[email protected] "
                              "| perl -lane "
                              "'if(/^@/) {{print; next;}} "
                              "@qual=split(//, $F[10]); "
                              "$_=chr(ord($_)+{}) for (@qual); "
                              "$F[10]=join(\"\",@qual); "
                              "print join(\"\\t\", @F)' "
                              "| samtools view -bS > @[email protected]".format(
                                  params.shift_quality))

        if is_true(params.remove_chr):
            # also substitute chrM to MT.
            statements.append("samtools view -h @[email protected] "
                              "| awk -v OFS='\\t' '"
                              "$1 == \"@SQ\" "
                              "{{ gsub(\"chrM\", \"chrMT\", $2); "
                              "   gsub(\"chr\", \"\", $2); print; next }} "
                              "{{ gsub(\"chrM\", \"chrMT\", $3); "
                              "   gsub(\"chr\", \"\", $3); print; next}} '"
                              "| samtools view -bS - "
                              "2> {outfile}_remove_chr.log "
                              "> @[email protected]; ".format(**locals()))

        if not statements:
            return infile + ".bam", "", ""

        filename, build_statement, cleanup_statement = P.join_statements(
            statements, infile)
        filename += ".bam"
        build_statement += (
            "; samtools index {filename} >& {outfile}.index.log".format(
                **locals()))

        return filename, build_statement, cleanup_statement
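
The @IN@/@OUT@ placeholders above are resolved by P.join_statements into a chain of temporary files; a simplified sketch of that convention follows (the real implementation lives in the pipeline library, so the details here are assumptions):

def join_statements_sketch(statements, infile, prefix="chain_tmp"):
    """replace @IN@ with the previous step's output and @OUT@ with a
    fresh temporary name; return (last file, build, cleanup) in the
    same shape as P.join_statements."""
    current, parts = infile, []
    for n, statement in enumerate(statements):
        out = "{}_{}".format(prefix, n + 1)
        parts.append(statement.replace("@IN@", current).replace("@OUT@", out))
        if "@OUT@" in statement:
            current = out
    return current, "; ".join(parts), "rm -f {}_*".format(prefix)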
Example #15
    def run(self, infile, outfile, params):
        # TODO: bam_fastqc_sequence_length_distribution.tsv may
        # contain ranges such as '30-31'. Convert to beginning of
        # range like in this perl command:
        #
        # perl -p -i -e "s/\-\d+//"
        # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

        if infile.endswith(".gz"):
            prefix = IOTools.snip(os.path.basename(infile[:-3]))
        else:
            prefix = IOTools.snip(os.path.basename(infile))

        outdir = os.path.dirname(outfile)

        datafile = os.path.join(outdir, "{}_fastqc".format(prefix),
                                "fastqc_data.txt")

        if not os.path.exists(datafile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            retval = P.run(
                "{params.path} "
                "{params.options} "
                "--extract "
                "--outdir {outdir} "
                "{infile} "
                ">& {outfile} ".format(**locals()), **params._asdict())
        else:
            IOTools.touch_file(outfile)
            retval = None

        def _split_output(lines):
            body, header, section, status = [], None, None, None
            for line in lines:
                if line.startswith("##FastQC"):
                    continue
                elif line.startswith("#"):
                    header, body = line[1:-1].split("\t"), []
                elif line.startswith(">>END_MODULE"):
                    yield section, header, body, status
                    body, header, section, status = [], None, None, None
                elif line.startswith(">>"):
                    section, status = line[2:-1].split("\t")
                else:
                    fields = line[:-1].split("\t")
                    body.append(fields)

        # split into separate files for upload
        summary_data = []
        with IOTools.open_file(datafile) as inf:
            for section, header, body, status in _split_output(inf):
                if len(body) == 0:
                    continue
                summary_data.append((section, status))
                tablename = "{}_".format(self.name) + re.sub(
                    " ", "_", section).lower()
                if tablename not in self.tablenames:
                    raise ValueError(
                        "unknown tablename {}, expected one of {}".format(
                            tablename, self.tablenames))
                output_file = ".".join((outfile, tablename, "tsv"))
                with open(output_file, "w") as outf:
                    outf.write("\t".join([x.lower() for x in header]) + "\n")
                    # write the table body
                    outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

        output_file = ".".join(
            (outfile, "{}_summary".format(self.name), "tsv"))
        with IOTools.open_file(output_file, "w") as outf:
            outf.write("section\tstatus\n")
            for section, status in summary_data:
                outf.write("{}\t{}\n".format(section, status))

        return retval
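
The FastQC report format consumed by _split_output looks like the fragment below; the generator yields one (section, header, body, status) tuple per module. The data is invented for illustration, and _split_output is assumed to be visible outside run():

report = [
    "##FastQC\t0.11.9\n",
    ">>Per base sequence quality\tpass\n",
    "#Base\tMean\n",
    "1\t33.5\n",
    ">>END_MODULE\n",
]
for section, header, body, status in _split_output(report):
    print(section, status, header, body)
# Per base sequence quality pass ['Base', 'Mean'] [['1', '33.5']]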