    def run(self, infile, outfile, params):

        if "reference_fasta" in params._fields:
            reference_fasta = "REFERENCE_SEQUENCE={}".format(
                params.reference_fasta)
        else:
            reference_fasta = ""

        # the command can fail (e.g. when no metrics are computed) but may
        # still produce partial output, so ignore errors.
        # job_memory of 12G covers the 8G java heap plus JVM overhead.
        retval = P.run("java -Xmx8000m -jar {params.path} "
                       "CollectMultipleMetrics "
                       "{reference_fasta} "
                       "INPUT={infile} "
                       "TMP_DIR=%(tmpdir)s "
                       "{params.options} "
                       "OUTPUT={outfile} "
                       ">& {outfile} ".format(**locals()),
                       job_memory="12G",
                       ignore_errors=True)

        def get_section(section, data):
            """return the lines belonging to the '## <section>' block."""
            pattern = "## {}".format(section)
            keep = False
            result = []
            for line in data:
                if line.startswith("##"):
                    if line.startswith(pattern):
                        keep = True
                    else:
                        keep = False
                if keep:
                    result.append(line)
            return result

        for tablename in self.tablenames:
            filename = re.sub("histogram", "metrics", tablename)
            raw = filename[len("picard_"):]
            src = outfile + "." + raw
            dest = outfile + "." + tablename + ".tsv"

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                data = inf.readlines()

            if tablename.endswith("metrics"):
                data = get_section("METRICS", data)
            elif tablename.endswith("histogram"):
                data = get_section("HISTOGRAM", data)

            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(data))

        return retval
    def run(self, infile, outfile, params):

        if params.reference_fasta_map is None:
            raise ValueError("bam2reference requires a reference sequence map")

        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)

        fasta = resolve_argument(list(reference_fasta_map.values()),
                                 ",").split(",")
        retval, diff = get_reference_for_bam(infile, fasta)
        if retval is None:
            if diff is None:
                retval = "corrupted"
            else:
                retval = "unknown"
                E.debug("differences: {}".format(str(diff)))
            path = ""
        else:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            path = map_path2name.get(retval, os.path.basename(retval))

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("filename\treference\tpath\n")
            outf.write("\t".join((infile, retval, path)) + "\n")

        return None
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected starting with 'find'. These
    expressions will be evaluated in a shell and the results insterted
    into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression evaluates to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two placeholder
    files named "test1" and "test2" are inserted.
    """

    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip() for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths
            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
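
# Usage sketch (illustrative, not part of the original source): expand a
# glob-based input section. With is_test=True, a glob that matches nothing
# is replaced by "test1"/"test2" placeholders instead of raising an error.
example_config = {"input": {"bam": "data/*.bam"}}
example_config = expand_globs(example_config, is_test=True)
# e.g. {'input': {'bam': ['data/test1.bam', 'data/test2.bam']}} when no
# files match the glob expression.
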
    def run(self, infiles, outfile, params):

        def _link(infile, outfile):
            if os.path.exists(os.path.abspath(outfile)):
                return

            dirname = os.path.dirname(outfile)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            os.symlink(infile, os.path.abspath(outfile))

        rx = re.compile(params.regex)

        outfiles = []
        for infile in infiles:

            outpath = os.path.join(
                os.path.dirname(outfile),
                rx.search(infile).expand(params.pattern_out))

            for suffix in self.suffixes:
                for fn in glob.glob(infile + suffix):
                    _link(fn, outpath + suffix)
            _link(os.path.abspath(infile), outpath)
            outfiles.append(outpath)

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("\n".join(outfiles) + "\n")
    def run(self, infile, outfile, params):

        if params.reference_bed is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # bedtools intersect requires a consistent sort order, so sort both
        # files. It also requires identical chromosome content, so restrict
        # the output to the common set of contigs.
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        statements = [stmnt]
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa "
                          "| bgzip "
                          "> {outfile}.shared.bed.gz")
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_test.bed.gz")
        statements.append("{params.path} intersect "
                          "-b {tmpf_test} "
                          "-a {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_truth.bed.gz")
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        for section in self.sections:
            statements.append(
                "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

        statement = "; ".join(statements)
        retval = P.run(statement.format(**locals()))

        # these are small files, so count them here.
        # TODO: implement a tabix.count() method.
        counts = dict()
        for section in self.sections:
            # pysam.Tabixfile is opened and closed explicitly here rather
            # than via a context manager.
            inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
            counts[section] = len(list(inf.fetch()))
            inf.close()

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("section\tcounts\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

        return retval
def get_default_params():
    """return default parameters for tools/metrics.

    Could be refactored to read defaults from a user-specified file.
    The current implementation uses the file located within the
    repository.
    """

    with IOTools.open_file(
            os.path.join(os.path.dirname(__file__), "defaults.yml")) as inf:
        result = yaml.load(inf, Loader=RoundTripLoader)
    return result
def line_grouper(filename):
    """iterate over groups of lines in a log file.

    Lines not starting with a date (YYYY-MM-DD) are treated as
    continuation lines and appended to the previous entry.
    """
    rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
    with IOTools.open_file(filename) as infile:
        last_line = None
        for line in infile:
            line = line.strip()
            if not rx.match(line):
                if last_line is None:
                    # skip continuation lines before the first dated entry
                    continue
                last_line = " ".join((last_line, line))
            else:
                if last_line:
                    yield last_line
                last_line = line
        if last_line:
            yield last_line
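
# Usage sketch (illustrative, not part of the original source): fold
# continuation lines of a timestamped log into single entries.
# "pipeline.log" is a placeholder filename.
if os.path.exists("pipeline.log"):
    for entry in line_grouper("pipeline.log"):
        print(entry)
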
    def run(self, infile, outfile, params):

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("contig\tcount\tsum\tmin\tmax\tmean\t"
                       "median\tstddev\tcollapse\n")

        retval = P.run("zcat {infile} "
                       "| awk '{{printf(\"%%s\\t%%i\\n\", $1, $3-$2); "
                       " printf(\"total\\t%%i\\n\", $3-$2)}}' "
                       "| sort -k1,1 "
                       "| {params.path} groupby "
                       "-g 1 "
                       "-c 2 "
                       "-o count,sum,min,max,mean,median,stddev,collapse "
                       "{params.options} "
                       "2> {outfile}.log "
                       ">> {outfile}; ".format(**locals()))

        return retval
def resolve_argument(argument, sep=","):
    """if argument is a container type (dict, list, tuple)
    resolve its contents to comma-separated list.
    """
    if isinstance(argument, dict):
        if len(argument) != 1:
            raise ValueError(
                "expected a single entry dictionary, got '{}'".format(
                    argument))
        return sep.join(x[2] for x in IOTools.nested_iter(argument))
    elif isinstance(argument, list) or isinstance(argument, tuple):
        return sep.join(argument)
    # special treatment for output from run_collate_link_output
    elif "filelist" in argument:
        f = [
            x.strip() for x in IOTools.open_file(argument).readlines()
            if not x.startswith("#")
        ]
        return sep.join([x for x in f if x])

    return argument
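
# Usage sketch (illustrative, not part of the original source): containers
# are flattened to a separator-delimited string, while plain strings pass
# through unchanged.
assert resolve_argument(["a.bam", "b.bam"]) == "a.bam,b.bam"
assert resolve_argument("a.bam", sep=" ") == "a.bam"
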
    def run(self, infile, outfile, params):

        if params.reference_vcf is None:
            raise ValueError("missing input parameter 'reference_vcf'")
        if params.reference_sdf is None:
            raise ValueError("missing input parameter 'reference_sdf'")
        if params.callable_bed is None:
            raise ValueError("missing input parameter 'callable_bed'")

        outfile_regions = outfile + ".bed.gz"
        restrict_bed(outfile_regions,
                     params.callable_bed,
                     infile,
                     remove_chr=params.remove_chr,
                     add_chr=params.add_chr)

        outputdir = os.path.join(os.path.dirname(outfile), "vcfeval.dir")
        if os.path.exists(outputdir):
            shutil.rmtree(outputdir)

        if LooseVersion(self.get_version()) < LooseVersion("3.7"):
            bed_options = "--bed-regions={}".format(params.callable_bed)
            output_columns = [
                "threshold", "true_positive_count", "false_positive_count",
                "false_negative_count", "false_positive_rate",
                "false_negative_rate", "f_measure"
            ]
        else:
            bed_options = "--evaluation-regions={}".format(params.callable_bed)
            output_columns = [
                "threshold", "true_positive_baseline", "true_positive_count",
                "false_positive_count", "false_negative_count",
                "false_positive_rate", "false_negative_rate", "f_measure"
            ]

        retval = self.run_with_preprocessing(
            infile,
            outfile,
            params,
            "{params.path} vcfeval "
            "--calls={infile} "
            "--baseline={params.reference_vcf} "
            "--template={params.reference_sdf} "
            "{bed_options} "
            "--output={outputdir} "
            "{params.options} "
            ">& {outfile}.log ".format(**locals()),
            job_memory="unlimited")

        with IOTools.open_file(os.path.join(outputdir, "summary.txt")) as inf:
            with IOTools.open_file(outfile, "w") as outf:
                table = []
                for line in inf:
                    if line.startswith("-"):
                        continue
                    line = re.sub("^ +", "", line)
                    line = re.sub(" +", "\t", line)
                    fields = line[:-1].split("\t")
                    table.append(fields)

                df = pandas.DataFrame(table[1:], columns=table[0])
                df.columns = output_columns
                # vcfeval reports precision and sensitivity; convert them
                # to false positive and false negative rates
                df["false_positive_rate"] = 1.0 - df[
                    "false_positive_rate"].astype(float)
                df["false_negative_rate"] = 1.0 - df[
                    "false_negative_rate"].astype(float)
                df.to_csv(outf, sep="\t", index=False)

        return retval
    def run(self, infile, outfile, params):

        if params.reference_vcf is None:
            raise ValueError("missing input parameter 'reference_vcf'")
        if params.reference_sdf is None:
            raise ValueError("missing input parameter 'reference_sdf'")
        if params.callable_bed is None and params.reference_fasta is None:
            raise ValueError(
                "missing input parameter: either 'callable_bed' or "
                "'reference_fasta' is needed")

        outfile_regions = outfile + ".bed.gz"

        if "callable_bed" in params._fields:
            restrict_bed(outfile_regions,
                         params.callable_bed,
                         infile,
                         remove_chr=params.remove_chr,
                         add_chr=params.add_chr)
        else:
            create_genome_bed(outfile_regions,
                              infile,
                              params.reference_fasta,
                              remove_chr=params.remove_chr,
                              add_chr=params.add_chr)

        with pysam.VariantFile(params.reference_vcf.strip()) as inf:
            try:
                # in some pathological VCFs (multiple headers), sample
                # names are not read properly by pysam.
                sample_name = list(inf.header.samples)[0]
            except IndexError:
                sample_name = "TOCOMPARE"

            params = update_namedtuple(params, rename_samples=sample_name)

        outfile_reference = outfile + ".ref.vcf.gz"
        preprocess_reference = self.build_statement_with_preprocessing(
            params.reference_vcf, outfile_reference, params,
            "mv {params.reference_vcf} {outfile_reference}; "
            "tabix -f -p vcf {outfile_reference}".format(**locals()))

        outputdir = os.path.join(os.path.dirname(outfile), "vcfeval.dir")

        if os.path.exists(outputdir):
            shutil.rmtree(outputdir)

        # The java VM does not work with the ulimit -v and ulimit -h
        # options.
        retval = self.run_with_preprocessing(
            infile,
            outfile,
            params,
            "{preprocess_reference}; "
            "{params.path} vcfeval "
            "--calls={infile} "
            "--baseline={outfile_reference} "
            "--template={params.reference_sdf} "
            "--bed-regions={outfile_regions} "
            "--output={outputdir} "
            "--sample={sample_name} "
            ">& {outfile}.log; "
            "rm -f {outfile_reference} {outfile_reference}.tbi ".format(
                **locals()),
            job_memory="unlimited",
        )

        with IOTools.open_file(os.path.join(outputdir, "summary.txt")) as inf:
            with IOTools.open_file(outfile, "w") as outf:
                table = []
                for line in inf:
                    if line.startswith("-"):
                        continue
                    line = re.sub("^ +", "", line)
                    line = re.sub(" +", "\t", line)
                    fields = line[:-1].split("\t")
                    table.append(fields)

                df = pandas.DataFrame(table[1:], columns=table[0])
                df.columns = [
                    "threshold", "true_positive_count", "false_positive_count",
                    "false_negative_count", "false_positive_rate",
                    "false_negative_rate", "f_measure"
                ]
                # vcfeval reports precision and sensitivity; convert them
                # to false positive and false negative rates
                df["false_positive_rate"] = 1.0 - df[
                    "false_positive_rate"].astype(float)
                df["false_negative_rate"] = 1.0 - df[
                    "false_negative_rate"].astype(float)
                df.to_csv(outf, sep="\t", index=False)

        return retval
def build_combinations(config):
    """build combinations of configuration parameters

    Return all possible combinations between configuration
    values. 

    There are two types of combinatorics that are applied::

       option1:
         - value1
         - value2
       option2:
         - valueA
         - valueB

    will combine into::

       - option1/value1 x option2/valueA
       - option1/value1 x option2/valueB
       ...

    Values can be grouped at lower levels for tools expecting
    multiple input files of the same type, for example a collection of
    samples to process::

       option1:
          - group1:
             - value1
             - value2
          - group2:
             - value3
             - value4
       option2:
         - valueA
         - valueB

    Will result in::

       - option1/[value1, value2] x option2/valueA
       - option1/[value1, value2] x option2/valueB
       ...

    For tools requiring multiple input files, such as a group of
    samples and a reference sequence, use the following syntax::

      test1:
         bam:
            - value1
            - value2
         reference: value10
      test2:
         bam:
            - value3
            - value4
         reference: value11

      groupby: label

    This translates into::

       - bam/[value1, value2] x reference/value10
       - bam/[value3, value4] x reference/value11

    Note the ``groupby`` variable indicating that options
    should be grouped by the top level (label).

    Configuration values taking multiple values are
    identified by lists, for example::

    >>> build_combinations({'option1': ["value1", "value2"]})
    [{'option1': 'value1'}, {'option1': 'value2'}]
    >>> build_combinations({'option1': ["value1", "value2"], 'option2': 'valueA'})
    [{'option2': 'valueA', 'option1': 'value1'}, {'option2': 'valueA', 'option1': 'value2'}]
    >>> build_combinations({'option1': ["value1", "value2"], 'option2': ["valueA", "valueB"]})
    [{'option2': 'valueA', 'option1': 'value1'}, {'option2': 'valueA', 'option1': 'value2'}, {'option2': 'valueB', 'option1': 'value1'}, {'option2': 'valueB', 'option1': 'value2'}]  # nopep8
    >>> benchmark.Workflow.build_combinations({'option1': [{"value1": [1,2,3]}, {"value2": [4,5,6]}]})
    [{'option1': {'value1': [1, 2, 3]}}, {'option1': {'value2': [4, 5, 6]}}]

    Args:
        config (dict): Configuration dictionary

    Returns:
        list : List of dictionaries

    """

    if not config:
        return [{}]

    groupby = "option"

    if "groupby" in config:
        groupby = config["groupby"].strip()
        if groupby not in ("label", "option", "file"):
            raise ValueError(
                "unknown groupby option '{}', "
                "expected one of {}".format(
                    groupby, str(("label", "option", "file"))))
        del config["groupby"]

    combinations = []
    if groupby == "option":
        # add multiplicity of input files
        try:
            variable = [(k, v) for k, v in list(config.items())
                        if isinstance(v, list)]
        except AttributeError:
            raise ValueError(
                "issue with configuration for option '{}'. "
                "possibly due to supplying option for tool directly and "
                "not using 'options'".format(config))

        variable = [x for x in variable if x[0] not in RESERVED_WORDS]

        if variable:
            constant = [(k, v) for k, v in list(config.items())
                        if not isinstance(v, list)]
            levels = [x[0] for x in variable]
            values = [merge_shared_values(x[1]) for x in variable]
            for combination in itertools.product(*values):
                d = dict(constant + list(zip(levels, combination)))
                combinations.append(d)
        else:
            combinations.append(config)

    elif groupby == "label":
        for k, v in list(config.items()):
            assert isinstance(v, dict)
            combinations.append(v)

    elif groupby == "file":
        # use a design-file to define groups
        if "label" not in config:
            raise ValueError("using file requires a 'label' column to be set")

        label_columns = config["label"]
        if not isinstance(label_columns, (list, tuple)):
            label_columns = [label_columns]

        filelist = config["input"]
        if not isinstance(filelist, list):
            filelist = [filelist]

        if len(filelist) > 1:
            raise NotImplementedError("using multiple files is not implemented")

        for fn in filelist:
            with IOTools.open_file(fn) as inf:
                df = pandas.read_table(inf, dtype=str)

        for label_column in label_columns:
            if label_column not in df.columns:
                raise ValueError(
                    "label column {} specified, but does not exist in {}".format(
                        label_column, fn))

        map_column2slot = {}
        shared_values = set()
        columns = set(df.columns)

        for key, value in list(config.items()):
            if key == "label":
                continue

            shared_value = True
            if not isinstance(value, list):
                value = [value]

            for v in value:
                if v in columns and v not in label_columns:
                    map_column2slot[v] = key
                    shared_value = False

            if shared_value and key != "input":
                shared_values.add(key)

        if len(map_column2slot) == 0:
            raise ValueError(
                "no mapping found between column headers ({}) "
                "and slots in config file ({})".format(
                    ",".join(df.columns),
                    ",".join(list(config.keys()))))

        for row in df.iterrows():
            combination = {}
            for shared_value in shared_values:
                combination[shared_value] = config[shared_value]
            dd = dict(row[1])

            for column, slot in list(map_column2slot.items()):
                val = dd[column]
                if ',' in val:
                    val = val.split(',')
                if slot in combination:
                    raise ValueError("duplicate slots: {}".format(slot))
                combination[slot] = val

            combination["name"] = "-".join([re.sub(" ", "_", dd[x]) for x in label_columns])
            combinations.append(combination)

    return combinations
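
# Usage sketch (illustrative, not part of the original source): with
# ``groupby: label`` each top-level label contributes one combination
# verbatim (see the docstring above).
combos = build_combinations({
    "test1": {"bam": ["value1", "value2"], "reference": "value10"},
    "test2": {"bam": ["value3", "value4"], "reference": "value11"},
    "groupby": "label"})
# -> [{'bam': ['value1', 'value2'], 'reference': 'value10'},
#     {'bam': ['value3', 'value4'], 'reference': 'value11'}]
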
    def run(self, infile, outfile, params):

        options = []
        reference_fasta = params.reference_fasta
        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)
        reference_label = None
        use_target_regions = True
        if params.reference_fasta:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            if params.reference_fasta == "auto":

                fasta = resolve_argument(list(reference_fasta_map.values()),
                                         ",").split(",")

                reference_fasta, diffs = get_reference_for_bam(
                    infile, fastafiles=fasta)

                if reference_fasta:
                    options.append("--ref-seq {}".format(reference_fasta))
                    reference_label = map_path2name[reference_fasta]
                elif diffs:
                    E.warn(
                        "attempted to detect reference fasta, but unable to do so. "
                        "diffs: {}".format(diffs))
                else:
                    E.warn("sequence dict is empty, BAM likely to be empty. "
                           "target_regions will be ignored")
                    use_target_regions = False
            else:
                options.append("--ref-seq {}".format(params.reference_fasta))
                reference_label = map_path2name.get(params.reference_fasta,
                                                    None)

        if params.target_regions and use_target_regions:
            target_regions = get_associated_file(params, reference_label,
                                                 "target_regions")
            # convert to 1-based coordinates and decompress
            if target_regions.endswith(".bed.gz"):
                target_regions = (
                    "<(zcat {} "
                    "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                    .format(target_regions))
            options.append("--target-regions {}".format(target_regions))

        options = " ".join(options)
        if not os.path.exists(outfile + ".tmp"):
            try:
                retval = P.run("{params.path} stats "
                               "{self.options} "
                               "{options} "
                               "{infile} "
                               "2> {outfile}.log "
                               "> {outfile}.tmp; ".format(**locals()),
                               job_memory="16G")
            except OSError as e:
                E.warn("input file {} gave the following errors: {}".format(
                    infile, str(e)))
                return None
        else:
            retval = None

        def split_output(lines):
            is_comment = True
            section, body = None, []
            for line in lines:
                if line.startswith("#"):
                    if body:
                        yield section, body
                    body = []
                    is_comment = True
                else:
                    # strip trailing comments; '.' does not match the
                    # newline, so the trailing newline is preserved
                    line = re.sub(r"\t#.*", "", line)
                    fields = line[:-1].split("\t")
                    section = fields[0]
                    body.append(fields[1:])
                    is_comment = False

            if body:
                yield section, body

        # split into separate files for upload
        with IOTools.open_file(outfile + ".tmp") as inf:
            for section, body in split_output(inf):
                try:
                    tablename, columns = self._map_section_to_table[section]
                except KeyError:
                    continue

                output_file = self.map_table_to_file(tablename, outfile)
                with IOTools.open_file(output_file, "w") as outf:

                    if len(columns) > 1 and columns[1].startswith("VAR_"):
                        outf.write("{}\t{}\n".format(columns[0],
                                                     columns[1][4:]))
                        for data in body:
                            outf.write("{}\t{}\n".format(
                                data[0], ",".join(data)))
                    else:
                        outf.write("\t".join(columns) + "\n")
                        # the section identifier column was already removed
                        # when building 'body'
                        outf.write("\n".join(["\t".join(x)
                                              for x in body]) + "\n")

        os.rename(outfile + ".tmp", outfile)

        return retval
    def run(self, infile, outfile, params):
        # TODO: bam_fastqc_sequence_length_distribution.tsv may
        # contain ranges such as '30-31'. Convert to beginning of
        # range like in this perl command:
        #
        # perl -p -i -e "s/\-\d+//"
        # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

        if infile.endswith(".gz"):
            prefix = IOTools.snip(os.path.basename(infile[:-3]))
        else:
            prefix = IOTools.snip(os.path.basename(infile))

        outdir = os.path.dirname(outfile)

        datafile = os.path.join(outdir, "{}_fastqc".format(prefix),
                                "fastqc_data.txt")

        if not os.path.exists(datafile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            retval = P.run(
                "{params.path} "
                "{params.options} "
                "--extract "
                "--outdir {outdir} "
                "{infile} "
                ">& {outfile} ".format(**locals()), **params._asdict())
        else:
            IOTools.touch_file(outfile)
            retval = None

        def _split_output(lines):
            body, header, section, status = [], None, None, None
            for line in lines:
                if line.startswith("##FastQC"):
                    continue
                elif line.startswith("#"):
                    header, body = line[1:-1].split("\t"), []
                elif line.startswith(">>END_MODULE"):
                    yield section, header, body, status
                    body, header, section, status = [], None, None, None
                elif line.startswith(">>"):
                    section, status = line[2:-1].split("\t")
                else:
                    fields = line[:-1].split("\t")
                    body.append(fields)

        # split into separate files for upload
        summary_data = []
        with IOTools.open_file(datafile) as inf:
            for section, header, body, status in _split_output(inf):
                if len(body) == 0:
                    continue
                summary_data.append((section, status))
                tablename = "{}_".format(self.name) + re.sub(
                    " ", "_", section).lower()
                if tablename not in self.tablenames:
                    raise ValueError(
                        "unknown tablename {}, expected one of {}".format(
                            tablename, self.tablenames))
                output_file = ".".join((outfile, tablename, "tsv"))
                with open(output_file, "w") as outf:
                    outf.write("\t".join([x.lower() for x in header]) + "\n")
                    # write the table body
                    outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

        output_file = ".".join(
            (outfile, "{}_summary".format(self.name), "tsv"))
        with IOTools.open_file(output_file, "w") as outf:
            outf.write("section\tstatus\n")
            for section, status in summary_data:
                outf.write("{}\t{}\n".format(section, status))

        return retval
    def run(self, outfile, params):

        bam = resolve_argument(params.bam, sep=" ")
        reference_fasta = get_reference(params)

        if params.parallel:
            statements = []
            files_to_merge = []
            jobsfile = outfile + ".jobs"

            if re.search("--region", params.options):
                region = re.search("--region[= ]*(\S+)",
                                   params.options).groups()[0]
                filter_contig, filter_start, filter_end = parse_region_string(
                    region)
            else:
                filter_contig, filter_start, filter_end = None, None, None

            plain_options = re.sub(r"--region[= ]\S+", "", params.options)

            statements = []

            with pysam.FastaFile(reference_fasta) as fastaf:
                for contig, length in zip(fastaf.references, fastaf.lengths):
                    if filter_contig and contig != filter_contig:
                        continue
                    begin_range = filter_start if filter_start else 0
                    end_range = filter_end if filter_end else length

                    for start in range(begin_range, end_range,
                                       params.chunk_size):
                        fn = os.path.join(
                            outfile +
                            ".chunk_{}_{:08}.vcf.gz".format(contig, start))
                        files_to_merge.append(fn)
                        if os.path.exists(fn):
                            continue
                        end = min(start + params.chunk_size, length)
                        statements.append(
                            "{params.path} "
                            "--fasta-reference {reference_fasta} "
                            "--region {contig}:{start}-{end} "
                            "{plain_options} "
                            "{bam} "
                            "2> {fn}.log "
                            "| bgzip "
                            "> {fn}\n".format(**locals()))

            retvals = P.run(statements, job_array=True)

            fn = " ".join(files_to_merge)
            statement = ("zcat {fn} "
                         "| vcffirstheader "
                         "2> {outfile}.vcffirstheader.log "
                         "| vcfstreamsort -w 1000 "
                         "2> {outfile}.vcfstreamsort.log "
                         "| vcfuniq "
                         "2> {outfile}.vcfuniq.log "
                         "| bgzip "
                         "2> {outfile}.bgzip.log "
                         "> {outfile}; "
                         "tabix -p vcf {outfile} "
                         "2> {outfile}.tabix.log; "
                         "rm -f {fn} "
                         "".format(**locals()))

            retvals.extend(P.run(statement))

        else:
            # limit number of jobs to node to limit I/O
            job_threads = 2

            retvals = P.run(
                "{params.path} "
                "--fasta-reference {reference_fasta} "
                "{params.options} "
                "{bam} "
                "2> {outfile}.log "
                "| bgzip "
                "> {outfile}; "
                "tabix -p vcf {outfile}".format(**locals()),
                **params._asdict())

        if "set_filter_exclude" in params._fields:
            with IOTools.open_file(outfile + ".header.vcf", "w") as outf:
                outf.write(
                    "##FILTER=<ID=HARD,Description=\"Variant fails hard filters: {}\"> "
                    .format(params.set_filter_exclude))
            job_threads = 1
            # note: variants matching the exclude expression will have their
            # FILTER value set to "HARD" by the annotation step below
            retvals.extend(
                P.run("bcftools query "
                      "--include \"{params.set_filter_exclude}\" "
                      "-f \"%%CHROM\\t%%POS\\tHARD\\n\" "
                      "{outfile}.save.vcf.gz "
                      "| bgzip > {outfile}.tab.gz; "
                      "tabix -s 1 -b 2 -e 2 {outfile}.tab.gz; "
                      "bcftools annotate "
                      "-a {outfile}.tab.gz "
                      "-c CHROM,POS,FILTER "
                      "--header-lines {outfile}.header.vcf "
                      "{outfile}.save.vcf.gz "
                      "| bgzip > {outfile}.new.vcf.gz; "
                      "mv {outfile}.new.vcf.gz {outfile}; "
                      "tabix -f -p vcf {outfile} ".format(**locals())))

        return retvals
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directories",
        action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in library and to the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt, Loader=yaml.SafeLoader)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory, raw_txt),
                                    Loader=yaml.SafeLoader)
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as first argument to extras.  The
    configuration dictionary should have an element 'database' with the
    required field ``url`` and the optional field ``schema``.  For
    example, to upload to an sqlite database in the current directory
    called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    Arguments
    ---------
    infiles: list
       List of files to upload. These should be the output
       of metric tasks in a benchmarking workflow.
    outfile: output file
       On success, an empty output file is created.
    extras: list
       List of one element containing a configuration dictionary
       (see above).

    """

    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError("expecting only one extra argument "
                         "(configuration dictionary)")

    config = extras[0]

    url = config["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}

    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = sqlalchemy.create_engine(url, connect_args=connect_args)

    # Catch exceptions in case the database is not (yet) available
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn("could not connect to database at {}. "
                    "The data will not be uploaded. Msg={}".format(
                        url, str(msg)))
        return

    # Create schema if not exists
    if schema is not None:
        engine.execute(text("CREATE SCHEMA IF NOT EXISTS {}".format(schema)))

    pipeline_name = os.path.basename(sys.argv[0])
    logger.debug("uploading data to {}, schema={}".format(url, schema))
    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y) \
    #                  for x, y in enumerate(dependencies)])

    # need to set created dir somehow, important when re-loading
    # as otherwise all times will be the same.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version(),
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")

    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)
    session.commit()

    tool_dirs = set()

    table_cache = TableCache(engine, schema, is_sqlite3)

    for infile in infiles:

        path, name = os.path.split(infile)

        # walk up the path to find "benchmark.info" as it might be
        # located on a higher level if the tool output multiple files.
        parts = path.split(os.sep)

        info_paths = []
        rootdir = os.getcwd()
        while len(parts):
            p = os.path.join(*parts)
            if p == rootdir:
                break
            if os.path.exists(os.path.join(p, "benchmark.info")):
                info_paths.append(p)
            parts.pop()
        info_paths = info_paths[::-1]

        # the level of nesting determines the layout:
        # 1 level: aggregation: tool == metric
        # 2 levels: tool + metric
        # 3 levels: tool + split + metric
        if len(info_paths) not in (1, 2, 3):
            raise ValueError("for {}, expected two or three paths with info, "
                             "got {}".format(infile, len(info_paths)))

        meta_data = {}

        if len(info_paths) == 1:
            tool_dir = metric_dir = info_paths[0]
            split_dir = None
        elif len(info_paths) == 2:
            tool_dir, metric_dir = info_paths
            split_dir = None
            # If there are multiple output files in aggregation, use
            # intermediate paths as split_subset factors.
            td = len(tool_dir.split(os.sep))
            tm = len(metric_dir.split(os.sep))
            d = tm - td
            if d > 1:
                meta_data["split_subset"] = re.sub(
                    ".dir", "", os.sep.join(metric_dir.split(os.sep)[td:-1]))
        elif len(info_paths) == 3:
            tool_dir, split_dir, metric_dir = info_paths

        if tool_dir:
            d = read_data(os.path.join(tool_dir, "benchmark.info"),
                          prefix="tool_")
            if "tool_action" in d:
                assert d["tool_action"] == "tool"
            meta_data.update(d)

        if metric_dir:
            d = read_data(os.path.join(metric_dir, "benchmark.info"),
                          prefix="metric_")
            if "metric_action" in d:
                # ignore splits, they will be added through metrics
                if d["metric_action"] == "split":
                    continue
                assert d["metric_action"] == "metric", \
                    "action for metric info {} is not 'metric', but '{}'" \
                    .format(os.path.join(metric_dir, "benchmark.info"),
                            d["metric_action"])

            meta_data.update(d)

        if split_dir:
            d = read_data(os.path.join(split_dir, "benchmark.info"),
                          prefix="split_")
            if "split_action" in d:
                assert d["split_action"] == "split"
            meta_data.update(d)
            subset = os.path.basename(os.path.dirname(info_paths[-1]))
            if subset.endswith(".dir"):
                subset = subset[:-len(".dir")]
            meta_data["split_subset"] = subset

        # tool_input_files can either be a dictionary if a tool
        # or a simple list if aggregation.
        try:
            tool_input_files = [
                x["path"] for x in meta_data["tool_input_files"]
            ]
        except TypeError:
            tool_input_files = meta_data["tool_input_files"]

        try:
            instance = BenchmarkInstance(
                run_id=benchmark_run.id,
                completed=datetime.datetime.fromtimestamp(
                    os.path.getmtime(infile)),
                input=",".join(tool_input_files),
                input_alias=meta_data["tool_input_alias"],
                tool_name=meta_data["tool_name"],
                tool_version=meta_data["tool_version"],
                tool_options=meta_data["tool_options"],
                tool_hash=meta_data["tool_option_hash"],
                tool_alias=meta_data.get("tool_alias", ""),
                metric_name=meta_data["metric_name"],
                metric_version=meta_data["metric_version"],
                metric_options=meta_data["metric_options"],
                metric_hash=meta_data["metric_option_hash"],
                metric_alias=meta_data.get("metric_alias", ""),
                split_name=meta_data.get("split_name", ""),
                split_version=meta_data.get("split_version", ""),
                split_options=meta_data.get("split_options", ""),
                split_hash=meta_data.get("split_option_hash", ""),
                split_alias=meta_data.get("split_alias", ""),
                split_subset=meta_data.get("split_subset", "all"),
                meta_data=json.dumps(meta_data))
        except KeyError as e:
            raise KeyError("missing required attribute {} in {}".format(
                str(e), str(meta_data)))

        session.add(instance)
        session.commit()

        # avoid multiple upload of tool data
        if tool_dir and tool_dir not in tool_dirs:
            tool_dirs.add(tool_dir)
            save_benchmark_timings(tool_dir, "tool_timings", engine, instance,
                                   schema, is_sqlite3)

        save_benchmark_timings(metric_dir, "metric_timings", engine, instance,
                               schema, is_sqlite3)

        metric_table_filter = None
        if "metric_no_upload" in meta_data:
            if meta_data["metric_no_upload"] == "*":
                logger.warn("upload turned off for metric {}".format(
                    meta_data["metric_name"]))
                continue
            else:
                metric_table_filter = re.compile(meta_data["metric_no_upload"])

        # multiple tablenames for multiple metric output
        #
        # Tables are added into schemas to avoid cluttering
        # the public namespace.
        # (if only blobs, no metric output file)
        if "metric_output_files" in meta_data:
            assert len(meta_data["metric_output_files"]) == \
                len(meta_data["metric_tablenames"])

            for output_file, tablename in zip(meta_data["metric_output_files"],
                                              meta_data["metric_tablenames"]):

                if metric_table_filter and metric_table_filter.search(
                        tablename):
                    logger.warn(
                        "upload for table {} turned off".format(tablename))
                    continue

                if not os.path.exists(output_file):
                    logger.warn(
                        "output file {} does not exist - ignored".format(
                            output_file))
                    continue

                if IOTools.is_empty(output_file):
                    logger.warn("output file {} is empty - ignored".format(
                        output_file))
                    continue

                try:
                    table = pandas.read_csv(output_file,
                                            sep="\t",
                                            comment="#",
                                            skip_blank_lines=True)
                except ValueError as e:
                    logger.warn("table {} can not be read: {}".format(
                        output_file, str(e)))
                    continue
                except pandas.parser.CParserError as e:
                    logger.warn(
                        "malformatted table {} can not be read: {}".format(
                            output_file, str(e)))
                    continue

                if len(table) == 0:
                    logger.warn(
                        "table {} is empty - ignored".format(output_file))
                    continue

                tablename, table, dtypes = transform_table_before_upload(
                    tablename, table, instance, meta_data, table_cache)

                if schema is None:
                    tn = tablename
                else:
                    tn = "{}.{}".format(schema, tablename)

                logger.debug("saving data from {} to table {}".format(
                    output_file, tn))
                # add foreign key
                table["instance_id"] = instance.id
                table_cache.add_table(table, tablename, dtypes)

        if "metric_blob_globs" in meta_data:
            metric_dir = meta_data["metric_outdir"]
            files = [
                glob.glob(os.path.join(metric_dir, x))
                for x in meta_data["metric_blob_globs"]
            ]
            files = IOTools.flatten(files)
            logger.debug("uploading binary data in {} files from {} to "
                         "table binary_data".format(len(files), metric_dir))
            table = []
            for fn in files:
                with IOTools.open_file(fn, "rb") as inf:
                    data_row = BenchmarkBinaryData(
                        instance_id=instance.id,
                        filename=os.path.basename(fn),
                        path=fn,
                        data=inf.read())
                    session.add(data_row)
                session.commit()

    table_cache.close()
    touch(outfile)

    # upload table sizes
    df_sizes = pandas.DataFrame.from_records(
        list(table_cache.uploaded_sizes.items()),
        columns=["tablename", "bytes_uploaded"])
    df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    df_sizes["run_id"] = benchmark_run.id
    df_sizes["schema"] = schema
    save_table(df_sizes,
               engine,
               "metric_storage",
               schema=None,
               is_sqlite3=is_sqlite3)

    # check if arvados job
    if Arvados.have_arvados():
        try:
            arv_job_info = arvados.current_job()
        except KeyError:
            arv_job_info = None

        if arv_job_info is not None:
            arv_job = BenchmarkArvadosJob(
                run_id=benchmark_run.id,
                job_uuid=arv_job_info["uuid"],
                owner_uuid=arv_job_info["owner_uuid"])
            session.add(arv_job)
            session.commit()

    benchmark_run.status = "complete"
    session.commit()

    engine.dispose()
    del engine

    logger.info("uploaded results under run_id {}".format(benchmark_run.id))
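
# Usage sketch (illustrative, not part of the original source): upload
# metric outputs into a local sqlite database. The layout of "config"
# mirrors the docstring above; "title", "description" and "tags" are also
# required by the code. The file names below are placeholders.
#
#   config = {"database": {"url": "sqlite:///./csvdb"},
#             "title": "test run",
#             "description": "example upload",
#             "tags": ["test"]}
#   upload_result(["metric.dir/results.tsv"], "upload.done", config)
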
    def run(self, infiles, outfile, params):

        tmpdir = P.get_temp_dir(clear=True)

        statements = ["mkdir {}".format(tmpdir)]

        if params.remove_fields:
            cleanup_statement = ("| {params.path} annotate "
                                 "-x {params.remove_fields} "
                                 "2> {outfile}_annotate.log ".format(
                                     **locals()))
        else:
            cleanup_statement = ""

        # the current pattern is probably overly specific and
        # substitutes ./. with 0/0
        if params.set_missing_genotype_to_reference:
            set_genotype = r"| perl -p -e 's/\.\/\./0\/0/g'"
        else:
            set_genotype = ""

        with IOTools.open_file(outfile + ".filelist_blocks", "w") as blockf:

            for start in range(0, len(infiles), self.block_size):

                fn = outfile + ".filelist_{}".format(start)
                fn_vcf = os.path.join(tmpdir, "block_{}.vcf.gz".format(start))
                with IOTools.open_file(fn, "w") as outf:
                    end = start + self.block_size
                    outf.write("\n".join(infiles[start:end]) + "\n")

                statements.append("{params.path} merge "
                                  "{params.options} "
                                  "-O v "
                                  "--file-list {outfile}.filelist_{start} "
                                  "2> {outfile}_merge_{start}.log "
                                  "{cleanup_statement} "
                                  "{set_genotype} "
                                  "| bgzip "
                                  "> {fn_vcf}; "
                                  "tabix -p vcf {fn_vcf}".format(**locals()))

                blockf.write(fn_vcf + "\n")

        if params.restrict_to_all:
            filter_statement = ("| {params.path} filter "
                                "--include \"FORMAT/GT != '.'\" "
                                "-O v "
                                "2> {outfile}_filter.log ".format(**locals()))
        else:
            filter_statement = ""

        statements.append("{params.path} merge "
                          "{params.options} "
                          "-O v "
                          "--file-list {outfile}.filelist_blocks "
                          "2> {outfile}_merge.log "
                          "{filter_statement} "
                          "| bgzip "
                          "> {outfile}; "
                          "tabix -p vcf {outfile} ".format(**locals()))

        statements.append("rm -rf {}".format(tmpdir))

        statement = "; ".join(statements)

        retvals = P.run(statement, **params._asdict())

        return retvals
    def run(self, outfile, params):

        reference_fasta = resolve_argument(params.reference_fasta,
                                           ",").split(",")
        if len(reference_fasta) == 2:
            reference1, reference2 = reference_fasta
        else:
            raise NotImplementedError()

        outfile = os.path.abspath(outfile)
        outfile_fastq = os.path.join(os.path.dirname(outfile),
                                     "result.fastq.gz")

        # build BAM header
        with pysam.FastaFile(reference1) as inf:
            with IOTools.open_file(outfile + ".header.sam", "w") as outf:
                outf.write("@HD\tVN:1.3\tSO:unsorted\n")
                for contig, length in zip(inf.references, inf.lengths):
                    outf.write("@SQ\tSN:{}\tLN:{}\n".format(contig, length))

        # not enough space on tmp
        # tmpdir = P.get_temp_filename(clear=True)
        tmpdir = os.path.join(os.path.dirname(outfile), "tmp")
        statements = []
        statements.append("mkdir -p {tmpdir}")

        if params.use_sample_method:
            fastq_filename = os.path.join(tmpdir, "tmp.fastq")
            if not os.path.exists(fastq_filename):
                if params.set_quality_score and params.set_quality_score.strip():

                    statements.append(
                        "daisy fastq2fastq "
                        "--quality-offset={params.set_quality_score} "
                        "--log={outfile}.fastq.log "
                        "{params.fastq} "
                        "> {fastq_filename}".format(**locals()))
                else:
                    statements.append(
                        "zcat {params.fastq} > {tmpdir}/tmp.fastq")

            statements.append("cd {tmpdir}")

            statements.append("{params.path} "
                              "{params.options} "
                              "--sample-fastq={tmpdir}/tmp.fastq "
                              "{reference1} "
                              "--prefix=H1 "
                              ">& {outfile}.pbsim1.log")

            statements.append("{params.path} "
                              "{params.options} "
                              "--sample-fastq={tmpdir}/tmp.fastq "
                              "{reference2} "
                              "--prefix=H2 "
                              ">& {outfile}.pbsim2.log")
        else:
            statements.append("cd {tmpdir}")

            statements.append("{params.path} "
                              "{params.options} "
                              "--prefix=H1 "
                              "{reference1} "
                              ">& {outfile}.pbsim1.log")
            statements.append("{params.path} "
                              "{params.options} "
                              "--prefix=H2 "
                              "{reference2} "
                              ">& {outfile}.pbsim2.log")

        statements.append("daisy fastq2fastq "
                          "--input-fastq-file={tmpdir}/H1_0001.fastq "
                          "--output-removed-tsv={outfile}.removed1 "
                          "--set-prefix=H1 "
                          "--method=filter-N "
                          "--log={outfile}.log "
                          "| gzip "
                          "> {outfile_fastq}")

        statements.append("cat {tmpdir}/H1_0001.maf "
                          "| daisy maf2maf "
                          "--input-filter-tsv={outfile}.removed1 "
                          "--log={outfile}.maf.log "
                          "--set-prefix=H1 "
                          "> {tmpdir}/tmp.maf")

        statements.append("daisy fastq2fastq "
                          "--input-fastq-file={tmpdir}/H2_0001.fastq "
                          "--output-removed-tsv={outfile}.removed2 "
                          "--method=filter-N "
                          "--set-prefix=H2 "
                          "--log={outfile}.log "
                          "| gzip "
                          ">> {outfile_fastq}")

        statements.append("cat {tmpdir}/H2_0001.maf "
                          "| daisy maf2maf "
                          "--input-filter-tsv={outfile}.removed1 "
                          "--log={outfile}.maf.log "
                          "--set-prefix=H2 "
                          ">> {tmpdir}/tmp.maf")

        # TODO: generalize; the maf contig name "ref" is hard-coded to
        # chromosome 22 below
        statements.append("maf-convert sam {tmpdir}/tmp.maf "
                          "| grep -v '^@' "
                          "| perl -p -e \"s/ref/22/\" "
                          ">> {tmpdir}/tmp.sam")

        statements.append("cat {outfile}.header.sam {tmpdir}/tmp.sam "
                          "| samtools view -bS "
                          "| samtools sort -T {tmpdir}/ -O bam - > {outfile}")

        statements.append("samtools index {outfile}")

        statements.append("rm -rf {tmpdir}")

        statement = "; ".join(statements).format(**locals())

        return P.run(statement)