Example #1
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected starting with 'find'. These
    expressions will be evaluated in a shell and the results insterted
    into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression is evaluated to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files will be
    returned called "test1" and "test2".
    """

    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip() for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths
            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
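For illustration, a minimal standalone sketch of the same glob-expansion idea (hypothetical helper and config; the function above walks arbitrary nesting via IOTools.nested_iter):

# Simplified, standalone sketch of glob expansion in a nested config
# (hypothetical helper; not the IOTools-based implementation above).
import glob

def expand_globs_simple(config):
    for key, value in list(config.items()):
        if isinstance(value, dict):
            expand_globs_simple(value)
        elif isinstance(value, str) and "*" in value:
            # comma-separated glob expressions are flattened into one list
            config[key] = [f for part in value.split(",")
                           for f in glob.glob(part.strip())]
    return config

config = {"input": {"bam": "data/*.bam", "vcf": "calls/*.vcf.gz"}}
print(expand_globs_simple(config))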
def inner(self, outfile, *args, **kwargs):
    try:
        f()
    except Exception as e:
        E.warn("received exception {} - touching {}".format(
            str(e), outfile))
    IOTools.touch_file(outfile)
    def run(self, infile, outfile, params):

        if "reference_fasta" in params._fields:
            reference_fasta = "REFERENCE_SEQUENCE={}".format(
                params.reference_fasta)
        else:
            reference_fasta = ""

        # the command can fail when no output is produced, but may still
        # write partial output; 12G is required for java overhead
        retval = P.run("java -Xmx8000m -jar {params.path} "
                       "CollectMultipleMetrics "
                       "{reference_fasta} "
                       "INPUT={infile} "
                       "TMP_DIR=%(tmpdir)s "
                       "{params.options} "
                       "OUTPUT={outfile} "
                       ">& {outfile} ".format(**locals()),
                       job_memory="12G",
                       ignore_errors=True)

        def get_section(section, data):
            pattern = "## {}".format(section)
            keep = False
            result = []
            for line in data:
                if line.startswith("##"):
                    if line.startswith(pattern):
                        keep = True
                    else:
                        keep = False
                if keep:
                    result.append(line)
            return result

        for tablename in self.tablenames:
            filename = re.sub("histogram", "metrics", tablename)
            raw = filename[len("picard_"):]
            src = outfile + "." + raw
            dest = outfile + "." + tablename + ".tsv"

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                data = inf.readlines()

            if tablename.endswith("metrics"):
                data = get_section("METRICS", data)
            elif tablename.endswith("histogram"):
                data = get_section("HISTOGRAM", data)

            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(data))

        return retval
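As an illustration of the section parsing used above, a standalone sketch with hypothetical sample lines; it keeps only the block introduced by a matching '## ...' header:

# Standalone sketch of extracting a "## ..."-delimited section from
# Picard-style output (hypothetical sample lines; same logic as get_section).
def get_section(section, data):
    pattern = "## {}".format(section)
    keep = False
    result = []
    for line in data:
        if line.startswith("##"):
            keep = line.startswith(pattern)
        if keep:
            result.append(line)
    return result

sample = [
    "## htsjdk.samtools.metrics.StringHeader\n",
    "# CollectMultipleMetrics INPUT=sample.bam\n",
    "## METRICS CLASS\tAlignmentSummaryMetrics\n",
    "CATEGORY\tTOTAL_READS\n",
    "PAIR\t1000\n",
]
print("".join(get_section("METRICS", sample)))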
Example #4
    def run(self, infile, outfile, params):

        if params.reference_fasta is None:
            raise ValueError("please provide a reference database")

        statement = (
            "{params.path_nucmer} -p {outfile} {params.reference_fasta} {infile} >& {outfile}.nucmer; "
            "{params.path_dnadiff} -p {outfile} -d {outfile}.delta >& {outfile}.dnadiff; "
            "{params.path_mummerplot} --large --fat --png {outfile}.1delta >& {outfile}.mummerplot"
            .format(**locals()))

        retval = P.run(statement)
        IOTools.touch_file(outfile)
        return retval
    def ignore_task(self, infiles, outfiles, params):
        """return True if task should be ignored.

        This method will also create the output file(s).
        """
        if self._ignore:
            m = str(outfiles)
            for ignore in IOTools.val2list(self._ignore):
                if ignore in m:
                    E.warn("task {} will be ignored".format(self.__name__))
                    for f in IOTools.val2list(outfiles):
                        E.info("creating empty file {}".format(f))
                        IOTools.touch_file(f)
                    return True
        return False
    def save_benchmark(self, outfile, benchmark):

        if not isinstance(benchmark, list):
            benchmark = [benchmark]

        # flatten if nested list and remove None
        benchmark = [
            x for x in IOTools.flatten(benchmark, ltypes=(list, ))
            if x is not None
        ]

        filename = self.build_meta_filename(outfile, "benchmark.bench")

        if not benchmark:
            E.warn("could not save benchmark info to {}".format(filename))
            return

        try:
            header = benchmark[0]._fields
        except AttributeError as ex:
            E.warn("could not save benchmark timings for {}:"
                   " {} from {}".format(outfile, str(ex), str(benchmark[0])))
            return

        with open(filename, "w") as outf:
            outf.write("\t".join(header) + "\n")
            for b in benchmark:
                outf.write("\t".join(map(str, b)) + "\n")
    def run(self, infile, outfile, params):

        if params.reference_fasta_map is None:
            raise ValueError("bam2reference requires a reference sequence map")

        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)

        fasta = resolve_argument(list(reference_fasta_map.values()),
                                 ",").split(",")
        retval, diff = get_reference_for_bam(infile, fasta)
        if retval is None:
            if diff is None:
                retval = "corrupted"
            else:
                retval = "unknown"
                E.debug("differences: {}".format(str(diff)))
            path = ""
        else:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            path = map_path2name.get(retval, os.path.basename(retval))

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("filename\treference\tpath\n")
            outf.write("\t".join((infile, retval, path)) + "\n")

        return None
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        stmnts = []

        prefix = IOTools.snip(outfile, ".vcf.gz")
        vcf_output = prefix + ".raw.vcf.gz"

        if not os.path.exists(vcf_output):
            stmnts.append("java "
                          "-Djava.io.tmpdir=%(tmpdir)s "
                          "-jar {self.path} "
                          "--analysis_type HaplotypeCaller "
                          "--input_file {bam} "
                          "--reference_sequence {reference_fasta} "
                          "--logging_level INFO "
                          "--log_to_file {outfile}.HaplotypeCaller.log "
                          "{params.haplotypecaller} "
                          "--out {vcf_output} "
                          ">& {prefix}.HaplotypeCaller.err".format(**locals()))
        else:
            E.warn("output file {vcf_output} already exists - "
                   "it will not be recomputed".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="5G")
    def run(self, infiles, outfile, params):

        def _link(infile, outfile):
            if os.path.exists(os.path.abspath(outfile)):
                return

            dirname = os.path.dirname(outfile)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            os.symlink(infile, os.path.abspath(outfile))

        rx = re.compile(params.regex)

        outfiles = []
        for infile in infiles:

            outpath = os.path.join(
                os.path.dirname(outfile),
                rx.search(infile).expand(params.pattern_out))

            for suffix in self.suffixes:
                for fn in glob.glob(infile + suffix):
                    _link(fn, outpath + suffix)
            _link(os.path.abspath(infile), outpath)
            outfiles.append(outpath)

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("\n".join(outfiles) + "\n")
Example #10
def expand_generators(config):
    """expand generator expressions in option lists.

    A generator expression are valid python syntax and
    has the following syntax::

      options: generate=["--chrom={}".format(x) for x in [1,2,3,4,5]]

    """

    to_delete = []
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("generate="):
                expression = re.sub("^generate=\s*", "", value)
                if expression.startswith("'") and expression.startswith("'"):
                    expression = expression[1:-1]
                try:
                    argument_list = eval(expression)
                except SyntaxError as ex:
                    raise ValueError(
                        "error occured while evaluating generator "
                        "expression {}: {}".format(expression, ex))
                if isinstance(d, list):
                    d.extend(argument_list)
                    to_delete.append((d, key))
                else:
                    d[key] = argument_list

    for d, key in to_delete[::-1]:
        del d[key]

    return config
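A minimal usage sketch of the 'generate=' expansion (hypothetical config, simplified to a flat dictionary instead of IOTools.nested_iter; note that eval of user-supplied expressions is unsafe in general):

# Standalone sketch of expanding a "generate=" expression in an option value
# (hypothetical config; simplified to a flat dict).
import re

config = {"options": "generate=['--chrom={}'.format(x) for x in [1, 2, 3]]"}

for key, value in list(config.items()):
    if isinstance(value, str) and value.startswith("generate="):
        expression = re.sub(r"^generate=\s*", "", value)
        config[key] = eval(expression)

print(config)  # {'options': ['--chrom=1', '--chrom=2', '--chrom=3']}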
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)
        stmnts = []
        prefix = IOTools.snip(outfile, ".bam")

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type RealignerTargetCreator "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {outfile}.RealignerTargetCreator.log "
            "{params.realignertargetcreator} "
            "--out {outfile}.realign.intervals "
            ">& {outfile}.RealignerTargetCreator.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type IndelRealigner "
                      "--input_file {bam} "
                      "--reference_sequence {reference_fasta} "
                      "--targetIntervals {outfile}.realign.intervals "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.IndelRealigner.log "
                      "{params.indelrealigner} "
                      "--out @[email protected] "
                      ">& {outfile}.IndelRealigner.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type BaseRecalibrator "
                      "--input_file @[email protected] "
                      "--reference_sequence {reference_fasta} "
                      "--logging_level INFO "
                      "{params.baserecalibrator} "
                      "--log_to_file {outfile}.BaseRecalibrator.log "
                      "--out {outfile}.recal_data.table "
                      ">& {outfile}.BaseRecalibrator.err".format(**locals()))

        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type PrintReads "
                      "--input_file @[email protected] "
                      "--reference_sequence {reference_fasta} "
                      "--BQSR {outfile}.recal_data.table "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.PrintReads.log "
                      "--out {outfile} "
                      ">& {outfile}.PrintReads.err".format(**locals()))

        stmnts.append("mv {prefix}.bai {outfile}.bam.bai")

        return self.run_statements(stmnts, job_memory="3G")
    def run(self, outfile, params):

        retvals = []
        prefix = IOTools.snip(outfile, ".bed.gz")
        vcffile = prefix + ".vcf.gz"
        if not os.path.exists(vcffile):
            retvals.extend(run_tool_delly.run(self, vcffile, params))

        statements = []

        statements.append("{self.path_bcftools} query "
                          "{params.bcftools_options} "
                          "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
                          "{vcffile} "
                          "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
                          "case \"DEL\": $5=0; break; "
                          "case \"DUP\": $5=3; break; "
                          "case \"INS\": next; break; "
                          "}}; print }}' "
                          "| bgzip "
                          "> {outfile}".format(**locals()))
        statements.append("tabix -f -p bed {outfile}".format(**locals()))

        statement = "; ".join(statements)
        retvals.append(P.run(statement))

        return retvals
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "src", "daisy", "tools",
                     "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(".py", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_name in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))

        counter.new += 1

    E.info(counter)
    E.stop()
def build_readgroup_string(outfile, params):

    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        try:
            readgroup_id = "-".join(re.search(
                params.readgroup_id_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_id_regex, outfile))

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        try:
            readgroup_sample = "-".join(re.search(
                params.readgroup_sample_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_sample_regex, outfile))

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)

    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
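For example, with a hypothetical filename and regex, the read-group ID is built by joining the regex groups (standalone sketch):

# Standalone sketch: derive a read-group string from a filename via regex
# groups (hypothetical filename and regex).
import re

outfile = "run1/sampleA-lane2.bam"
readgroup_id_regex = r"(sample\w+)-(lane\d+)"

readgroup_id = "-".join(re.search(readgroup_id_regex, outfile).groups())
readgroup_string = "@RG\tID:{}\tSM:{}".format(readgroup_id, readgroup_id)
print(readgroup_string)  # @RG  ID:sampleA-lane2  SM:sampleA-lane2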
Example #15
    def run(self, infile, outfile, params):

        if params.reference_bed is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # requires a consistent sort order, so sort both files.
        # It also requires the chromosome content to be identical,
        # so restrict output to common sets.
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        statements = [stmnt]
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa "
                          "| bgzip "
                          "> {outfile}.shared.bed.gz")
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_test.bed.gz")
        statements.append("{params.path} intersect "
                          "-b {tmpf_test} "
                          "-a {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_truth.bed.gz")
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        for section in self.sections:
            statements.append(
                "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

        statement = "; ".join(statements)
        retval = P.run(statement.format(**locals()))

        # these are small files, so count them here. TODO: implement a
        # tabix.count() method
        counts = dict()
        for section in self.sections:
            # with pysam.Tabixfile(outfile + "." + section + ".bed.gz") as inf:
            inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
            counts[section] = len(list(inf.fetch()))
            inf.close()

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("section\tcounts\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

        return retval
Example #16
    def run(self, infiles, outfile, params):

        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            before = len(infiles)
            infiles = [
                x for x in infiles
                if os.path.getsize(x) >= params.min_size_bytes
            ]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if params.newer_than:
            before = len(infiles)
            cutoff = os.path.getmtime(params.newer_than)
            infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if len(infiles) == 0:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)

        outfile_fail = IOTools.snip(outfile,
                                    "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(**locals()))
        return P.run(statement)
Example #17
    def run(self, infile, outfile, params):

        outfile_pass = IOTools.snip(outfile, ".tsv") + "-pass.fastq.gz"
        outfile_fail = IOTools.snip(outfile, ".tsv") + "-fail.fastq.gz"

        statement = ("zcat {infile} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "--output-stats-tsv={outfile} "
                     "- "
                     "| gzip "
                     "> {outfile_pass} "
                     "".format(**locals()))
        return P.run(statement)
Example #18
    def run(self, infile, outfile, params):

        if params.reference_database is None:
            raise ValueError("please provide a reference database")

        statement = (
            "{params.path_lastal} {params.lastal_options} "
            "{params.reference_database} {infile} "
            "| {params.path_lastsplit} {params.lastsplit_options} "
            "| {params.path_mafsort} "
            "| gzip "
            "> {outfile}.maf.gz; "
            "{params.path_lastdotplot} "
            "<(zcat {outfile}.maf.gz "
            "| daisy maf2maf --log={outfile}.filter.log --min-length={params.min_contig_length} ) "
            "{outfile}.png ".format(**locals()))

        retval = P.run(statement, job_memory="15G")
        IOTools.touch_file(outfile)
        return retval
def get_default_params():
    """return default parameters for tools/metrics.

    Could be refactored to read defaults from a user specified file.
    The current implementation takes the one located within the
    repository.
    """

    with IOTools.open_file(
            os.path.join(os.path.dirname(__file__), "defaults.yml")) as inf:
        result = yaml.load(inf, Loader=RoundTripLoader)
    return result
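A standalone sketch of the round-trip YAML loading used here, assuming ruamel.yaml is installed and using an inline document instead of the repository's defaults.yml:

# Standalone sketch: round-trip YAML loading with ruamel.yaml, which keeps
# comments and key order (assumes ruamel.yaml is installed; inline document
# used instead of defaults.yml).
import io
from ruamel.yaml import YAML

text = """\
tool:
  bwa:
    threads: 4   # comments survive a round-trip load
"""

yaml_loader = YAML(typ="rt")
defaults = yaml_loader.load(io.StringIO(text))
print(defaults["tool"]["bwa"]["threads"])  # 4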
def resolve_argument(argument, sep=","):
    """if argument is a container type (dict, list, tuple)
    resolve its contents to comma-separated list.
    """
    if isinstance(argument, dict):
        if len(argument) != 1:
            raise ValueError(
                "expected a single entry dictionary, got '{}'".format(
                    argument))
        return sep.join(x[2] for x in IOTools.nested_iter(argument))
    elif isinstance(argument, list) or isinstance(argument, tuple):
        return sep.join(argument)
    # special treatment for output from run_collate_link_output
    elif "filelist" in argument:
        f = [
            x.strip() for x in IOTools.open_file(argument).readlines()
            if not x.startswith("#")
        ]
        return sep.join([x for x in f if x])

    return argument
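A few hedged usage examples (hypothetical values) for the container cases; the 'filelist' branch needs a file on disk and is omitted:

# Standalone sketch of the list/tuple and single-entry dict cases of
# resolve_argument (hypothetical values; simplified from the IOTools version).
def resolve_argument_simple(argument, sep=","):
    if isinstance(argument, dict):
        # expects a single-entry dictionary, e.g. {"bam": ["a.bam", "b.bam"]}
        (value,) = argument.values()
        return sep.join(value)
    if isinstance(argument, (list, tuple)):
        return sep.join(argument)
    return argument

print(resolve_argument_simple(["a.bam", "b.bam"]))           # a.bam,b.bam
print(resolve_argument_simple({"bam": ["a.bam", "b.bam"]}))  # a.bam,b.bam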
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")
        bams = resolve_argument(params.bam, ",")
        reference_fasta = get_reference(params)

        statements, gvcfs = [], []
        # TODO: sort out multi-threading
        for idx, bam in enumerate(bams.split(",")):
            output = prefix + "." + str(idx) + ".g.vcf"
            gvcfs.append(output)

            if os.path.exists(output):
                E.info("{} already exists - skipped".format(output))
                continue

            statements.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--emitRefConfidence GVCF "
                "--logging_level INFO "
                "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
                "{params.haplotypecaller} "
                "--out {output} "
                ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

        if statements:
            self.run_statements(statements, job_memory="4G")

        stmnts = []
        gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
        vcf_output = prefix + ".raw.vcf.gz"
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type GenotypeGVCFs "
                      "--reference_sequence {reference_fasta} "
                      "{gvcfs} "
                      "--logging_level INFO "
                      "--log_to_file {prefix}.GenotypeGVCFs.log "
                      "{params.genotypegvcfs} "
                      "--out {vcf_output} "
                      ">& {prefix}.GenotypeGVCFs".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="4G")
def line_grouper(filename):
    rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
    with IOTools.open_file(filename) as infile:
        last_line = None
        for line in infile:
            line = line.strip()
            if not rx.match(line):
                last_line = " ".join((last_line, line))
            else:
                if last_line:
                    yield last_line
                last_line = line
        yield last_line
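A standalone sketch of the same grouping on an in-memory log (hypothetical lines): continuation lines without a leading date are folded into the preceding entry.

# Standalone sketch of grouping multi-line log entries by leading date
# (hypothetical log lines; reads from a list instead of a file).
import re

def group_lines(lines):
    rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
    last_line = None
    for line in lines:
        line = line.strip()
        if not rx.match(line):
            last_line = " ".join((last_line, line))
        else:
            if last_line:
                yield last_line
            last_line = line
    yield last_line

log = [
    "2020-01-01 12:00:00 task started",
    "2020-01-01 12:00:05 error: traceback follows",
    "  ValueError: something broke",
    "2020-01-01 12:00:06 task finished",
]
for entry in group_lines(log):
    print(entry)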
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")

        bam = resolve_argument(params.bam, sep=",")
        reference_fasta = get_reference(params)

        bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])
        stmnts = []
        if not os.path.exists(prefix + ".annotated.vcf.gz"):
            tmpfile, pre_statement, post_statement = self.pre_process(
                params.vcf, outfile, params)

            stmnts.append(pre_statement)
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type VariantAnnotator "
                "--variant {tmpfile} "
                "{bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {prefix}.VariantAnnotator.log "
                "--annotation FisherStrand "
                "--annotation StrandOddsRatio "
                "--annotation ReadPosRankSumTest "
                "--annotation RMSMappingQuality "
                "--annotation MappingQualityRankSumTest "
                "{params.options} "
                "--out {prefix}.annotated.vcf.gz "
                ">& {prefix}.VariantAnnotator.err".format(**locals()))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

            stmnts.append(post_statement)
        else:
            E.warn("using pre-existing file {} with annotated variants".format(
                prefix + ".annotated.vcf.gz"))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

        return self.run_statements(stmnts, job_memory="3G")
Example #24
    def run(self, infile, outfile, params):

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("contig\tcount\tsum\tmin\tmax\tmean\t"
                       "median\tstddev\tcollapse\n")

        retval = P.run("zcat {infile} "
                       "| awk '{{printf(\"%%s\\t%%i\\n\", $1, $3-$2); "
                       " printf(\"total\\t%%i\\n\", $3-$2)}}' "
                       "| sort -k1,1 "
                       "| {params.path} groupby "
                       "-g 1 "
                       "-c 2 "
                       "-o count,sum,min,max,mean,median,stddev,collapse "
                       "{params.options} "
                       "2> {outfile}.log "
                       ">> {outfile}; ".format(**locals()))

        return retval
    def run(self, outfile, params):

        bam = resolve_argument(params.bam, sep=",")

        # "-T {outfile}.tmpdir -k "

        outfile = IOTools.snip(outfile, ".gz")
        # note that lumpy removes the temporary directory
        # after running, thus make sure it is unique and exists
        return P.run("{params.path} "
                     "-B {bam} "
                     "-o {outfile} "
                     "-T %(tmpdir)s_{self.__name__} "
                     "-v "
                     "{params.options} "
                     ">& {outfile}.log; "
                     "vcf-sort {outfile} "
                     "| bgzip > {outfile}.gz; "
                     "tabix -p vcf {outfile}.gz".format(**locals()))
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)

        # rename index from x.bai to x.bam.bai
        outprefix = IOTools.snip(outfile, ".bam", ".cram")

        statement = ("java -Xmx8000m -jar {params.path} "
                     "MarkDuplicates "
                     "INPUT={bam} "
                     "TMP_DIR=%(tmpdir)s "
                     "CREATE_INDEX=TRUE "
                     "REFERENCE_SEQUENCE={params.reference_fasta} "
                     "METRICS_FILE={outfile}.metrics "
                     "{params.options} "
                     "OUTPUT={outfile} "
                     ">& {outfile}.log; "
                     "mv {outprefix}.bai {outfile}.bai".format(**locals()))

        # 12G is required for java overhead
        return P.run(statement, job_memory="12G")
Example #27
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        if self.mountpoint:
            # revert mount redirection for arvados to allow redirection
            # on individual cluster nodes
            for d, key, value in IOTools.nested_iter(infiles):
                d[key] = re.sub(self.mountpoint, "arv=", value)

        self.instantiate_input(infiles)
        self.save_meta(outfile, output_file=outfile)

        if only_info:
            E.warn("only_info - meta information has been updated")
            return

        params = self.build_params(output_file=outfile)
        benchmark = self.run(outfile, as_namedtuple(params))
        self.save_benchmark(outfile, benchmark)
def collect_file_meta_information(file_dict, nostats=False):
    """collect meta information on files

    Arg:
       file_dict(dict) : nested dictionary

    Returns:
       info(list)
    """
    results = []

    for d, key, filenames in IOTools.nested_iter(file_dict):
        if filenames is None:
            continue

        if isinstance(filenames, str):
            filenames = filenames.split(",")

        filenames = [x.strip() for x in filenames]
        for filename in filenames:
            abspath = os.path.realpath(filename)
            if nostats:
                st_size, st_mtime, st_ctime = 0, 0, 0
            else:
                if not os.path.exists(abspath):
                    raise OSError("file {} does not exist".format(filename))
                s = os.stat(filename)
                st_size, st_mtime, st_ctime = s.st_size, s.st_mtime, s.st_ctime

            results.append(
                collections.OrderedDict(
                    list(
                        zip(("path", "abspath", "size", "modification_time",
                             "creation_time"), (filename, abspath, st_size,
                                                st_mtime, st_ctime)))))

    return results
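For illustration, a standalone sketch that collects the same metadata for a flat list of files (hypothetical; the function above walks a nested dictionary and supports comma-separated values):

# Standalone sketch of collecting size/mtime/ctime metadata for a flat list
# of files (hypothetical; run on this script itself so it is self-contained).
import collections
import os

def file_meta(filenames):
    results = []
    for filename in filenames:
        abspath = os.path.realpath(filename)
        s = os.stat(filename)
        results.append(collections.OrderedDict(
            [("path", filename), ("abspath", abspath),
             ("size", s.st_size), ("modification_time", s.st_mtime),
             ("creation_time", s.st_ctime)]))
    return results

print(file_meta([__file__]))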
Example #29
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directory",
        action="append",
        help="directory TaskLibrary functions. Will be added to the built-in "
        "and the one specified in DAISY_TASKLIBRARY environment variable "
        "[%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory, raw_txt))
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
Example #30
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()