def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        stmnts = []

        prefix = IOTools.snip(outfile, ".vcf.gz")
        vcf_output = prefix + ".raw.vcf.gz"

        if not os.path.exists(vcf_output):
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {outfile}.HaplotypeCaller.log "
                "{params.haplotypecaller} "
                "--out {vcf_output} "
                ">& {prefix}.HaplotypeCaller.err".format(**locals()))
        else:
            E.warn("output file {vcf_output} already exists - "
                   "it will not be recomputed".format(**locals()))

        stmnts.extend(self.build_calibration_workflow(
            outfile, prefix, vcf_output, params))

        return self.run_statements(stmnts, job_memory="5G")
    def run(self, outfile, params):

        retvals = []
        prefix = IOTools.snip(outfile, ".bed.gz")
        vcffile = prefix + ".vcf.gz"
        if not os.path.exists(vcffile):
            retvals.extend(run_tool_delly.run(self, vcffile, params))

        statements = []

        statements.append(
            "{self.path_bcftools} query "
            "{params.bcftools_options} "
            "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
            "{vcffile} "
            "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
            "case \"DEL\": $5=0; break; "
            "case \"DUP\": $5=3; break; "
            "case \"INS\": next; break; "
            "}}; print }}' "
            "| bgzip "
            "> {outfile}".format(**locals()))
        statements.append(
            "tabix -f -p bed {outfile}".format(**locals()))

        statement = "; ".join(statements)
        retvals.append(P.run(statement))

        return retvals
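    # Hedged illustration (made-up records, not from the source): the
    # bcftools/awk pipeline above is expected to turn delly SV calls such as
    #
    #   chr1  1000   5000   DEL    ->  chr1  1000   5000   DEL  0
    #   chr1  20000  22000  DUP    ->  chr1  20000  22000  DUP  3
    #
    # into BED-like lines with a copy-number code appended as column 5
    # (DEL -> 0, DUP -> 3), while INS records are skipped entirely.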
def build_readgroup_string(outfile, params):

    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        try:
            readgroup_id = "-".join(re.search(
                params.readgroup_id_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_id_regex, outfile))

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        try:
            readgroup_sample = "-".join(re.search(
                params.readgroup_sample_regex,
                outfile).groups())
        except AttributeError as ex:
            raise AttributeError("regular expression {} does not match {}".format(
                params.readgroup_sample_regex, outfile))

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)

    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
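# Hedged usage sketch (hypothetical filename and params): with
# params.readgroup_id_regex unset and outfile "sample1.bam", the call would
# return roughly
#
#   readgroup_string == "@RG\tID:sample1\tSM:sample1"
#   readgroup_id     == "sample1"
#   readgroup_sample == "sample1"
#
# with any params.readgroup_header fields appended to the string.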
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename",
        dest="regex_filename",
        type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option("--filter",
                      dest="filters",
                      type="choice",
                      action="append",
                      choices=("PASS", "SNP"),
                      help="apply filters to VCFs when reading "
                      "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in args:
        if options.regex_filename:
            try:
                name = re.search(options.regex_filename, filename).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, filename))
        else:
            name = iotools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=options.filters)
        df[name] = 1
        dfs.append(df)

    ndata = len(dfs)
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    merged_df = merged_df.fillna(0)
    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)
    E.stop()
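# Hedged illustration of the resulting set_counts table (hypothetical tracks
# and counts): each input VCF contributes a 0/1 membership column and "counts"
# holds the number of positions falling into that combination, e.g.
#
#   sampleA  sampleB  counts
#   0        1        1234
#   1        0        987
#   1        1        4321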
def fastqscreen_filename2track(fn):
    """extract track name from fastqc filename.

    Because we deal with both paired end (track.fastq.1_fastqc
    and single end data (track_fastqc), this is a bit cumbersome.
    """
    return re.sub(".fastq.", "-",
                  iotools.snip(os.path.basename(fn), "_screen.txt"))
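# Hedged usage sketch (hypothetical filenames): the substitution above keeps
# the read number of paired-end data as a suffix, e.g.
#
#   fastqscreen_filename2track("sample1.fastq.1_screen.txt")  # -> "sample1-1"
#   fastqscreen_filename2track("sample1_screen.txt")          # -> "sample1"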
    def submit_function(*args, **kwargs):

        if "submit" in kwargs and kwargs["submit"]:
            del kwargs["submit"]
            submit_args, args_file = _pickle_args(args, kwargs)
            module_file = os.path.abspath(
                sys.modules[func.__module__].__file__)
            submit(iotools.snip(__file__),
                   "run_pickled",
                   args=[iotools.snip(module_file), function_name, args_file],
                   **submit_args)
        else:
            # remove job control options before running the function
            for x in ("submit", "job_options", "job_queue"):
                if x in kwargs:
                    del kwargs[x]
            return func(*args, **kwargs)
    def run(self, infile, outfile, params):

        outfile_pass = IOTools.snip(outfile, ".tsv") + "-pass.fastq.gz"
        outfile_fail = IOTools.snip(outfile, ".tsv") + "-fail.fastq.gz"

        statement = ("zcat {infile} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "--output-stats-tsv={outfile} "
                     "- "
                     "| gzip "
                     "> {outfile_pass} "
                     "".format(**locals()))
        return P.run(statement)
    def run(self, outfile, params):

        if self.file is None:
            raise ValueError("tool 'identity' requires a 'file'")

        fn = self.file
        if isinstance(fn, list):
            if len(fn) == 1:
                fn = fn[0]
            else:
                raise NotImplementedError(
                    "tool 'identity' called with multiple files: {}".format(
                        fn))

        source_fn = os.path.abspath(fn)

        def touch_and_mark_as_mounted(source, dest):
            o = os.stat(source)
            IOTools.touch_file(dest, times=(o.st_atime, o.st_mtime))
            with open(dest + ".mnt", "w") as outf:
                outf.write(get_mounted_location(source))

        if file_is_mounted(source_fn):
            link_f = touch_and_mark_as_mounted
        else:
            link_f = os.symlink

        if not os.path.exists(outfile):
            link_f(source_fn, outfile)

        if self.add_glob:
            if self.chop_suffix:
                source_fn = IOTools.snip(source_fn, self.chop_suffix)
                outfile = IOTools.snip(outfile, self.chop_suffix)

            prefix = len(os.path.basename(source_fn))

            for fn in glob.glob(source_fn + self.add_glob):
                target = outfile + os.path.basename(fn)[prefix:]
                if not os.path.exists(target):
                    link_f(os.path.abspath(fn), target)
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")
        bams = resolve_argument(params.bam, ",")
        reference_fasta = get_reference(params)

        statements, gvcfs = [], []
        # TODO: sort out multi-threading
        for idx, bam in enumerate(bams.split(",")):
            output = prefix + "." + str(idx) + ".g.vcf"
            gvcfs.append(output)

            if os.path.exists(output):
                E.info("{} already exists - skipped".format(output))
                continue

            statements.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--emitRefConfidence GVCF "
                "--logging_level INFO "
                "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
                "{params.haplotypecaller} "
                "--out {output} "
                ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

        if statements:
            self.run_statements(statements, job_memory="4G")

        stmnts = []
        gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
        vcf_output = prefix + ".raw.vcf.gz"
        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type GenotypeGVCFs "
            "--reference_sequence {reference_fasta} "
            "{gvcfs} "
            "--logging_level INFO "
            "--log_to_file {prefix}.GenotypeGVCFs.log "
            "{params.genotypegvcfs} "
            "--out {vcf_output} "
            ">& {prefix}.GenotypeGVCFs".format(**locals()))

        stmnts.extend(self.build_calibration_workflow(
            outfile, prefix, vcf_output, params))

        return self.run_statements(stmnts, job_memory="4G")
def summarizeFastqScreen(infiles, outfiles):
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(iotools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastqcscreen results to concatenate")
        for x in outfiles:
            iotools.touch_file(x)
        return
    df_summary, df_details = readqc.read_fastq_screen(
        all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
    def run(self, infiles, outfile, params):

        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            before = len(infiles)
            infiles = [
                x for x in infiles
                if os.path.getsize(x) >= params.min_size_bytes
            ]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if params.newer_than:
            before = len(infiles)
            cutoff = os.path.getmtime(params.newer_than)
            infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if len(infiles) == 0:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)

        outfile_fail = IOTools.snip(outfile,
                                    "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(**locals()))
        return P.run(statement)
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")

        bam = resolve_argument(params.bam, sep=",")
        reference_fasta = get_reference(params)

        bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])
        stmnts = []
        if not os.path.exists(prefix + ".annotated.vcf.gz"):
            tmpfile, pre_statement, post_statement = self.pre_process(
                params.vcf, outfile, params)

            stmnts.append(pre_statement)
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type VariantAnnotator "
                "--variant {tmpfile} "
                "{bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {prefix}.VariantAnnotator.log "
                "--annotation FisherStrand "
                "--annotation StrandOddsRatio "
                "--annotation ReadPosRankSumTest "
                "--annotation RMSMappingQuality "
                "--annotation MappingQualityRankSumTest "
                "{params.options} "
                "--out {prefix}.annotated.vcf.gz "
                ">& {prefix}.VariantAnnotator.err".format(**locals()))

            stmnts.extend(self.build_calibration_workflow(
                outfile, prefix, prefix + ".annotated.vcf.gz", params))

            stmnts.append(post_statement)
        else:
            E.warn("using pre-existing file {} with annotated variants".format(
                prefix + ".annotated.vcf.gz"))

            stmnts.extend(self.build_calibration_workflow(
                outfile, prefix, prefix + ".annotated.vcf.gz", params))

        return self.run_statements(stmnts, job_memory="3G")
    def run(self, outfile, params):

        bam = resolve_argument(params.bam, sep=",")

        # "-T {outfile}.tmpdir -k "

        outfile = IOTools.snip(outfile, ".gz")
        # note that lumpy removes the temporary directory
        # after running, thus make sure it is unique and exists
        return P.run("{params.path} "
                     "-B {bam} "
                     "-o {outfile} "
                     "-T %(tmpdir)s_{self.__name__} "
                     "-v "
                     "{params.options} "
                     ">& {outfile}.log; "
                     "vcf-sort {outfile} "
                     "| bgzip > {outfile}.gz; "
                     "tabix -p vcf {outfile}.gz"
                     .format(**locals()))
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)

        # rename index from x.bai to x.bam.bai
        outprefix = IOTools.snip(outfile, ".bam", ".cram")

        statement = ("java -Xmx8000m -jar {params.path} "
                     "MarkDuplicates "
                     "INPUT={bam} "
                     "TMP_DIR=%(tmpdir)s "
                     "CREATE_INDEX=TRUE "
                     "REFERENCE_SEQUENCE={params.reference_fasta} "
                     "METRICS_FILE={outfile}.metrics "
                     "{params.options} "
                     "OUTPUT={outfile} "
                     ">& {outfile}.log; "
                     "mv {outprefix}.bai {outfile}.bai".format(**locals()))

        # 12G is required for java overhead
        return P.run(statement, job_memory="12G")
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''collect fastqc status results from multiple runs into a single table.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    outfile : string
        Output filename in :term:`tsv` format.
    datadir : string
        Location of the actual Fastqc output to be parsed.
    '''

    outf = iotools.open_file(outfile, "w")
    names = set()
    results = []
    for infile in infiles:
        base_track = iotools.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir, base_track + "*_fastqc",
                                "fastqc_data.txt")
        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    iotools.open_file(fn)):
                stats[name] = status
            track = fastqc_filename2track(fn)
            results.append((track, fn, stats))
            names.update(list(stats.keys()))

    names = sorted(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))
    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" %
                   (track, os.path.dirname(fn), "\t".join(stats[x]
                                                          for x in names)))
    outf.close()
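# Hedged illustration of the status table written above (hypothetical tracks
# and sections): one row per fastqc_data.txt found, one column per FastQC
# section status, with the directory of the data file in the second column:
#
#   track    filename                Basic Statistics  Per base sequence quality
#   sample1  out.dir/sample1_fastqc  pass              warn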
def main(argv=None):

    if argv is None:
        argv = sys.argv

    modules = []
    for module in glob.glob(os.path.join(os.path.dirname(__file__), "*.py")):
        if os.path.basename(module) in IGNORE:
            continue
        if "flycheck" in module:
            continue
        mod = "daisy.tools.{}".format(snip(os.path.basename(module)))
        modules.append(importlib.import_module(mod))

    script_dict = {}
    for module in modules:
        try:
            f = [y for (x, y) in inspect.getmembers(module) if x == "main"][0]
        except IndexError:
            continue
        name = re.sub("_", "-", module.__name__.split(".")[-1])
        script_dict[name] = f
        for synonym in SYNONYMS.get(name, []):
            script_dict[synonym] = f

    if len(argv) == 1:
        print('\n'.join(sorted(script_dict.keys())))
    else:
        command_key = argv[1]
        command_args = argv[1:]

        command = script_dict[command_key]
        try:
            return command(command_args)
        except:
            print('When running {!r}'.format(command_key))
            raise
    def run(self, outfile, params):

        local_options = []
        outfile = os.path.abspath(outfile)
        outdir = os.path.dirname(outfile)

        # assumption is that the index is named after the reference with the
        # .fa/.fasta suffix removed, i.e. xyz for a reference xyz.fa.
        reference_fasta = IOTools.snip(params.reference_fasta, ".fa", ".fasta")
        if not os.path.exists(reference_fasta):
            raise ValueError("input reference {} does not exist".format(reference_fasta))

        if "--jobs" in params.options or "-j" in params.options:
            job_threads = int(re.search("(--jobs|-j)\s*(\d+)",
                                        params.options).groups()[1])
        else:
            job_threads = 8

        if "--memory-limit" in params.options or "-m" in params.options:
            job_memory_gb = int(re.search(r"(--memory-limit|-m)\s*(\d+)",
                                          params.options).groups()[1])
        else:
            job_memory_gb = 60
            local_options.append("--memory-limit {}".format(job_memory_gb))

        if job_memory_gb < 60:
            E.warn("isaac-align likely to require at least 60Gb of memory, {}G requested".format(
                job_memory_gb))

        job_memory = "{}G".format(float(job_memory_gb) / job_threads)

        fastq_dir = os.path.join(outdir, "input_fastq")
        if not os.path.exists(fastq_dir):
            os.makedirs(fastq_dir)

        if len(params.fastq) == 2:
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read1.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[0]), os.path.join(fastq_dir, "lane1_read1.fastq.gz"))
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read2.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[1]), os.path.join(fastq_dir, "lane1_read2.fastq.gz"))
        else:
            raise NotImplementedError("expected 2 fastq files, received only {}".format(len(params.fastq)))

        intermediate_bam = os.path.join(outdir,
                                        "Aligned",
                                        "Projects",
                                        "default",
                                        "default",
                                        "sorted.bam")

        # picard statement to set readgroup
        picard_statement = self.build_picard_statement(
            intermediate_bam,
            outfile,
            params)

        tmpdir = os.path.join(outdir, "TEMP")

        local_options = " ".join(local_options)
        # isaac generates output files in working directory, so do a cd and make
        # sure that absolute path names are used elsewhere.
        statement = (
            "cd {outdir}; "
            "{self.path} "
            "--reference-genome {reference_fasta}/sorted-reference.xml "
            "--base-calls {fastq_dir} "
            "--base-calls-format fastq-gz "
            "--temp-directory {tmpdir} "
            "--cleanup-intermediary 1 "
            "--bam-gzip-level {params.bam_gzip_level} "
            "{params.options} "
            "{local_options} "
            ">& {outfile}.isaac.log; "
            "{picard_statement}; "
            "rm -rf {tmpdir} "
            .format(**locals()))

        return P.run(statement)

module_dirs = [os.path.join(os.path.dirname(__file__))]
module_dirs.extend([
    x.strip() for x in os.environ.get("DAISY_TASKLIBRARY", "").split(",")
    if x.strip()
])

modules = []
for idx, root in enumerate(module_dirs):
    for module in glob.glob(os.path.join(root, "*.py")):
        if "flycheck" in module:
            continue
        if module.endswith("__init__.py"):
            continue
        module_name = IOTools.snip(os.path.basename(module))
        if idx == 0:
            modules.append(
                importlib.import_module("daisy.tasks.{}".format(module_name)))
        else:
            spec = importlib.util.spec_from_file_location(
                "daisy.UserLibrary.{}".format(module_name), module)
            foo = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(foo)
            modules.append(foo)

# TODO: use derivation instead of name prefix
map_tool_to_runner = dict()
map_metric_to_runner = dict()
map_collate_to_runner = dict()
map_split_to_runner = dict()
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized in order to avoid any biases due to chromosomes
    and break up local correlations.

    At the end, a q-value is computed from all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
       Output filename in :term:`tsv` format.
    outdir : string
       Directory for additional output files.
    method : string
       Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
       Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''cgat combine_tables
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | cgat csv_cut
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = iotools.snip(os.path.basename(outfile))
    E.info(prefix)

    # --bashrc=%(pipeline_scriptsdir)s/bashrc.cgat

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | cgat randomize_lines --keep-header=1
    | python -m cgatcore.pipeline.farm
    --method=multiprocessing
    --cluster-options="-l mem_free=16G"
    --cluster-queue=%(cluster_queue)s
    --cluster-num-jobs=%(cluster_num_jobs)i
    --cluster-priority=%(cluster_priority)i
    --cluster-queue-manager=%(cluster_queue_manager)s
    --cluster-memory-resource=%(cluster_memory_resource)s
    --cluster-memory-default=%(cluster_memory_default)s
    --input-header
    --output-header
    --split-at-lines=200000
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "python -m cgatpipelines.tasks.expression_runner
              --method=%(method)s
              --tags-tsv-file=%%STDIN%%
              --design-tsv-file=%(design_file)s
              --output-filename-pattern=%%DIR%%%(prefix)s_
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --deseq-sharing-mode=%(deseq_sharing_mode)s
              --edger-dispersion=%(edger_dispersion)f
              --deseq2-design-formula=%(deseq2_model)s
              --deseq2-contrasts=%(deseq2_contrasts)s
              --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
              --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
              --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f
              --deseq2-plot=0 "
    | perl -p -e "s/qvalue/old_qvalue/"
    | cgat table2table
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''
    E.info(statement)

    P.run(statement)
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)
        stmnts = []
        prefix = IOTools.snip(outfile, ".bam")

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type RealignerTargetCreator "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {outfile}.RealignerTargetCreator.log "
            "{params.realignertargetcreator} "
            "--out {outfile}.realign.intervals "
            ">& {outfile}.RealignerTargetCreator.err".format(**locals()))

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type IndelRealigner "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--targetIntervals {outfile}.realign.intervals "
            "--logging_level INFO "
            "--log_to_file {outfile}.IndelRealigner.log "
            "{params.indelrealigner} "
            "--out @[email protected] "
            ">& {outfile}.IndelRealigner.err".format(**locals()))

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type BaseRecalibrator "
            "--input_file @[email protected] "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "{params.baserecalibrator} "
            "--log_to_file {outfile}.BaseRecalibrator.log "
            "--out {outfile}.recal_data.table "
            ">& {outfile}.BaseRecalibrator.err".format(**locals()))

        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type PrintReads "
            "--input_file @[email protected] "
            "--reference_sequence {reference_fasta} "
            "--BQSR {outfile}.recal_data.table "
            "--logging_level INFO "
            "--log_to_file {outfile}.PrintReads.log "
            "--out {outfile} "
            ">& {outfile}.PrintReads.err".format(**locals()))

        stmnts.append(
            "mv {prefix}.bai {outfile}.bam.bai")

        return self.run_statements(stmnts, job_memory="3G")
def merge_and_load(infiles,
                   outfile,
                   suffix=None,
                   columns=(0, 1),
                   regex=None,
                   row_wise=True,
                   retry=True,
                   options="",
                   prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e., the contents of
    each file become a row.

    For example, the statement::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load")

    with the two files::
        > cat file1.txt
        Category    Result
        length      12
        width       100

        > cat file2.txt
        Category    Result
        length      20
        width       50

    will be added into table ``test_table`` as::
        track   length   width
        file1   12       100
        file2   20       50

    If row_wise is set to False::
        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load", row_wise=False)

    ``test_table`` will instead keep one column per file::
        track    file1 file2
        length   12    20
        width    100   50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off.  If
        `columns` is set to None, all columns will be taken. Here,
        column names will receive a prefix given by `prefixes`. If
        `prefixes` is None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a
        track name via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table.  This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each
        column. The number of `prefixes` and `infiles` needs to be the
        same.

    '''
    if len(infiles) == 0:
        raise ValueError("no files for merging")

    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix)) for x in infiles])
    elif regex:
        header = ",".join(
            ["-".join(re.search(regex, x).groups()) for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        column_filter = ""
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        transform = """| perl -p -e "s/bin/track/"
        | python -m cgatcore.table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(to_table(outfile),
                                          options="--add-index=track " +
                                          options,
                                          retry=retry)

    statement = """python -m cgatcore.tables
    %(header_stmt)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(filenames)s
    %(transform)s
    | %(load_statement)s
    > %(outfile)s
    """
    to_cluster = False
    run(statement)
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format, ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info(
        'the report is available at %s' %
        os.path.abspath(os.path.join(params['report_html'], "contents.html")))
    def pre_process(self, infile, outfile, params):

        statements = []
        infile = IOTools.snip(infile, ".bam")
        tmpdir = P.get_parameters_as_namedtuple().tmpdir
        outprefix = os.path.basename(os.path.dirname(outfile))

        if params.copy_bam:
            statements.append("cp @[email protected] @[email protected]; "
                              "cp @[email protected] @[email protected]")

        if params.split_bam:
            statements.append("daisy bam2bam-split-reads "
                              "-i @[email protected] "
                              "-o - "
                              "{params.split_bam} "
                              "--log={outfile}_split_bam.log "
                              "2> {outfile}_split_bam.err "
                              "> @[email protected]; ".format(**locals()))

        if params.bam2bam:
            statements.append("daisy bam2bam "
                              "--stdin=@[email protected] "
                              "{params.bam2bam} "
                              "--log={outfile}_bam2bam.log "
                              "2> {outfile}_bam2bam.err "
                              "> @[email protected]; ".format(**locals()))

        if params.region:
            statements.append(
                "samtools view -b @[email protected] {} > @[email protected]".format(
                    params.region))

        if params.shift_quality:
            statements.append("samtools view -h @[email protected] "
                              "| perl -lane "
                              "'if(/^@/) {{print; next;}} "
                              "@qual=split(//, $F[10]); "
                              "$_=chr(ord($_)+{}) for (@qual); "
                              "$F[10]=join(\"\",@qual); "
                              "print join(\"\\t\", @F)' "
                              "| samtools view -bS > @[email protected]".format(
                                  params.shift_quality))

        if is_true(params.remove_chr):
            # also substitute chrM to MT.
            statements.append("samtools view -h @[email protected] "
                              "| awk -v OFS='\\t' '"
                              "$1 == \"@SQ\" "
                              "{{ gsub(\"chrM\", \"chrMT\", $2); "
                              "   gsub(\"chr\", \"\", $2); print; next }} "
                              "{{ gsub(\"chrM\", \"chrMT\", $3); "
                              "   gsub(\"chr\", \"\", $3); print; next}} '"
                              "| samtools view -bS - "
                              "2> {outfile}_remove_chr.log "
                              "> @[email protected]; ".format(**locals()))

        if not statements:
            return infile + ".bam", "", ""

        filename, build_statement, cleanup_statement = P.join_statements(
            statements, infile)
        filename += ".bam"
        build_statement += (
            "; samtools index {filename} >& {outfile}.index.log".format(
                **locals()))

        return filename, build_statement, cleanup_statement
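    # Note on the placeholders above (assumption based on the CGAT convention):
    # @IN@ and @OUT@ are substituted by P.join_statements, which chains the
    # statements through temporary files so that the @OUT@ of one step becomes
    # the @IN@ of the next, starting from `infile` and ending in `filename`.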
    def run(self, infile, outfile, params):
        # TODO: bam_fastqc_sequence_length_distribution.tsv may
        # contain ranges such as '30-31'. Convert to beginning of
        # range like in this perl command:
        #
        # perl -p -i -e "s/\-\d+//"
        # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

        if infile.endswith(".gz"):
            prefix = IOTools.snip(os.path.basename(infile[:-3]))
        else:
            prefix = IOTools.snip(os.path.basename(infile))

        outdir = os.path.dirname(outfile)

        datafile = os.path.join(outdir, "{}_fastqc".format(prefix),
                                "fastqc_data.txt")

        if not os.path.exists(datafile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            retval = P.run(
                "{params.path} "
                "{params.options} "
                "--extract "
                "--outdir {outdir} "
                "{infile} "
                ">& {outfile} ".format(**locals()), **params._asdict())
        else:
            IOTools.touch_file(outfile)
            retval = None

        def _split_output(lines):
            body, header, section, status = [], None, None, None
            for line in lines:
                if line.startswith("##FastQC"):
                    continue
                elif line.startswith("#"):
                    header, body = line[1:-1].split("\t"), []
                elif line.startswith(">>END_MODULE"):
                    yield section, header, body, status
                    body, header, section, status = [], None, None, None
                elif line.startswith(">>"):
                    section, status = line[2:-1].split("\t")
                else:
                    fields = line[:-1].split("\t")
                    body.append(fields)

        # split into separate files for upload
        summary_data = []
        with IOTools.open_file(datafile) as inf:
            for section, header, body, status in _split_output(inf):
                if len(body) == 0:
                    continue
                summary_data.append((section, status))
                tablename = "{}_".format(self.name) + re.sub(
                    " ", "_", section).lower()
                if tablename not in self.tablenames:
                    raise ValueError(
                        "unknown tablename {}, expected one of {}".format(
                            tablename, self.tablenames))
                output_file = ".".join((outfile, tablename, "tsv"))
                with open(output_file, "w") as outf:
                    outf.write("\t".join([x.lower() for x in header]) + "\n")
                    # remove first column, which contains the identifier
                    outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

        output_file = ".".join(
            (outfile, "{}_summary".format(self.name), "tsv"))
        with IOTools.open_file(output_file, "w") as outf:
            outf.write("section\tstatus\n")
            for section, status in summary_data:
                outf.write("{}\t{}\n".format(section, status))

        return retval
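    # Hedged sketch of the fastqc_data.txt layout that _split_output above
    # expects (abridged; section names and statuses are illustrative):
    #
    #   ##FastQC            0.11.9
    #   >>Basic Statistics  pass
    #   #Measure            Value
    #   Filename            sample1.bam
    #   >>END_MODULE
    #
    # ">>" opens a section together with its status, "#" lines carry the
    # column header, and ">>END_MODULE" closes the section.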
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        # ensure output directory exists.
        # This should be done on the pipeline level, but
        # ruffus currently seems not to allow this.
        outdir = os.path.dirname(outfile)
        if outdir and not os.path.exists(outdir):
            os.makedirs(outdir)

        output_files = [
            self.map_table_to_file(x, outfile) for x in self.tablenames
        ]

        kwargs = {
            'output_files': output_files,
            'input_files': infiles,
            'outdir': outdir
        }

        if self._runtime_regex:
            kwargs["alias"] = self.build_alias(str(infiles),
                                               regex=self._runtime_regex,
                                               alias=self._runtime_alias)

        self.save_meta(outfile, **kwargs)

        if self.ignore:
            found = False
            for i in self.ignore:
                if i in outdir:
                    found = True
                    break

            if found:
                E.warn("skipping task {} at runtime, an empty file is created".
                       format(outfile))
                IOTools.touch_file(outfile)
                return

        # if self.runtime_filter:
        # TODO: create empty outfile if regex matches
        #    pass

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    IOTools.snip(outfile) + ".info"))
            return

        # AH: duplicated from above?
        params = self.build_params(output_files=output_files)

        on_error_options = ["raise", "ignore"]
        on_error = params.get("on_error", "raise")
        if on_error not in on_error_options:
            raise ValueError("unknown option to 'on_error': '{}' "
                             "should be one of '{}'".format(
                                 on_error, ",".join(on_error_options)))

        if self.ignore_task(infiles, outfile, params):
            return

        # deal with placeholder files created by identity that are
        # located on a remote mount point
        def map_to_mount(fn):
            if os.path.exists(fn + ".mnt"):
                if not P.PARAMS["mount_point"]:
                    raise ValueError(
                        "encountered mounted file {}, but no mount point present"
                        .format(fn))
                with open(fn + ".mnt") as inf:
                    mount_path = inf.read()
                return os.path.join(P.PARAMS["mount_point"], mount_path)
            else:
                return fn

        # replace infiles with mount locations if necessary
        if isinstance(infiles, list):
            infiles = [map_to_mount(x) for x in infiles]
        else:
            infiles = map_to_mount(infiles)

        try:
            benchmark = self.run(infiles, outfile, as_namedtuple(params))
        except Exception as ex:
            on_error = params.get("on_error", "raise")
            if on_error == "raise":
                raise
            elif on_error == "ignore":
                E.warn(
                    "error occured during execution of {} but will be ignored:\n{}"
                    .format(self.__name__, ex))
                E.warn(
                    "an empty output file {} will be created.".format(outfile))
                IOTools.touch_file(outfile)
                benchmark = None

        if benchmark:
            self.save_benchmark(outfile, benchmark)