Ejemplo n.º 1
0
    def customize(cls, pileup, infile, outfile, regions, dependencies=()):
        """Assemble the chained "cat" -> "vcf_filter" -> "bgzip" builders.

        The builders are linked through their IN_STDIN / OUT_STDOUT keyword
        arguments: "cat" reads ``infile`` and writes to a pipe, "vcf_filter"
        reads that pipe (plus ``pileup``) and writes to a pipe, and "bgzip"
        reads the filter output and writes to ``outfile``.

        One "--homozygous-chromosome" option is added for every entry of
        ``regions["HomozygousContigs"]``. ``dependencies`` is accepted but
        not used inside this method.

        Returns a dict with the single key "commands", mapping "cat",
        "filter" and "bgzip" to their builders (none of them finalized here).
        """
        reader = factory.new("cat")
        reader.add_value("%(IN_VCF)s")
        reader.set_kwargs(**{"IN_VCF": infile, "OUT_STDOUT": AtomicCmd.PIPE})

        filter_builder = factory.new("vcf_filter")
        filter_builder.add_option("--pileup", "%(IN_PILEUP)s")
        for contig_name in regions["HomozygousContigs"]:
            filter_builder.add_option("--homozygous-chromosome", contig_name)
        filter_builder.set_kwargs(
            IN_PILEUP=pileup,
            IN_STDIN=reader,
            OUT_STDOUT=AtomicCmd.PIPE,
        )

        compressor = AtomicCmdBuilder(
            ["bgzip"],
            IN_STDIN=filter_builder,
            OUT_STDOUT=outfile,
        )

        builders = {
            "cat": reader,
            "filter": filter_builder,
            "bgzip": compressor,
        }
        return {"commands": builders}
Ejemplo n.º 2
0
    def customize(cls, reference, infile, bedfile, outfile, pileup_only=False, nbatches=1, dependencies=()):
        """Prepare a "genotype" command builder for a BAM file.

        ``infile`` and ``outfile`` are bound to the positional placeholders
        IN_BAMFILE and OUT_VCFFILE. "--bedfile" is only set when ``bedfile``
        is truthy, although IN_INTERVALS is always passed to ``set_kwargs``.
        With ``pileup_only`` the builder additionally gets "--pileup-only"
        and an "-R" mpileup argument. The reference is always forwarded as
        an "-f=<reference>" mpileup argument.

        ``dependencies`` is accepted but not used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        builder = factory.new("genotype")
        for placeholder in ("%(IN_BAMFILE)s", "%(OUT_VCFFILE)s"):
            builder.add_value(placeholder)
        builder.set_option("--nbatches", nbatches)

        if bedfile:
            builder.set_option("--bedfile", "%(IN_INTERVALS)s")

        if pileup_only:
            builder.set_option("--pileup-only")
            # Read-groups are ignored when only a pileup is produced
            builder.add_option("--mpileup-argument", "-R", sep="=")

        # FASTA reference sequence, passed through to mpileup
        reference_argument = "-f=%s" % (reference,)
        builder.add_option("--mpileup-argument", reference_argument, sep="=")

        file_bindings = {
            "IN_BAMFILE": infile,
            "IN_INTERVALS": bedfile,
            "OUT_VCFFILE": outfile,
            "CHECK_SAMTOOLS": SAMTOOLS_VERSION,
            "CHECK_BCFTOOLS": BCFTOOLS_VERSION,
        }
        builder.set_kwargs(**file_bindings)

        return {"command": builder}
Ejemplo n.º 3
0
    def customize(cls, infile, bedfile, outfile, dependencies=()):
        """Prepare a "sample_pileup" command builder.

        ``infile`` is bound to "--genotype" (IN_PILEUP), ``bedfile`` to
        "--intervals" (IN_INTERVALS), and ``outfile`` receives the command's
        standard output (OUT_STDOUT). ``dependencies`` is accepted but not
        used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        builder = factory.new("sample_pileup")
        builder.set_option("--genotype", "%(IN_PILEUP)s")
        builder.set_option("--intervals", "%(IN_INTERVALS)s")

        file_bindings = {
            "IN_PILEUP": infile,
            "IN_INTERVALS": bedfile,
            "OUT_STDOUT": outfile,
        }
        builder.set_kwargs(**file_bindings)

        return {"command": builder}
Ejemplo n.º 4
0
def setup_basic_batch(args, regions, prefix, func):
    """Start a "genotype --filter-only" process piped into bgzip.

    A BED file is written via ``write_bed_file(prefix, regions)`` and
    recorded under both "files" and "temp_files". The filter process is
    started with a piped stdout; ``func(setup)`` supplies the stdin of the
    bgzip process, whose stdout is a file opened at ``prefix``.

    Returns the ``setup`` dict with the sections "files", "temp_files",
    "procs" and "handles" filled in.

    On any exception the traceback is written to stderr,
    ``cleanup_batch(setup)`` is called with whatever was registered so far,
    and the exception is re-raised.
    """
    setup = dict((section, {}) for section in ("files", "temp_files", "procs", "handles"))

    try:
        bed_path = write_bed_file(prefix, regions)
        setup["files"]["bed"] = bed_path
        setup["temp_files"]["bed"] = bed_path

        genotyper = factory.new("genotype")
        genotyper.set_option("--filter-only")
        genotyper.set_option("--bedfile", bed_path)
        genotyper.add_option(args.bamfile)
        genotyper.add_option(args.destination)

        filter_proc = processes.open_proc(
            genotyper.call,
            stdout=processes.PIPE,
            close_fds=True,
        )
        setup["procs"]["filter"] = filter_proc

        # Register the handle before starting bgzip, so that cleanup sees it
        setup["handles"]["outfile"] = open(prefix, "w")
        setup["procs"]["gzip"] = processes.open_proc(
            ["bgzip"],
            stdin=func(setup),
            stdout=setup["handles"]["outfile"],
            close_fds=True,
        )

        return setup
    except:
        # Bare except on purpose: clean up on every kind of failure, then
        # let the original exception propagate
        sys.stderr.write(traceback.format_exc() + "\n")
        cleanup_batch(setup)
        raise
Ejemplo n.º 5
0
def _build_cat_command(input_file, output_file):
    """Return an unfinalized "cat" builder copying one input to one output.

    ``input_file`` is bound to the positional IN_ARCHIVE placeholder and
    ``output_file`` to the "--output" option via TEMP_OUT_CAT.
    """
    builder = factory.new("cat")
    builder.set_option("--output", "%(TEMP_OUT_CAT)s")
    builder.add_value("%(IN_ARCHIVE)s")

    file_bindings = {"TEMP_OUT_CAT": output_file, "IN_ARCHIVE": input_file}
    builder.set_kwargs(**file_bindings)

    return builder
Ejemplo n.º 6
0
def _read_sequences(filename):
    """Read qualities from `filename` through a "cat" subprocess and sample.

    The file is streamed through the command built by ``factory.new("cat")``;
    the process' stdout is handed to ``_collect_qualities`` and the result
    is passed to ``sampling.reservoir_sampling`` with 100000 as the second
    argument.

    Returns:
        Whatever ``sampling.reservoir_sampling`` returns.

    Raises:
        NodeError: if the "cat" process finishes with a non-zero return
            code; the message includes the captured stderr.
        Any exception raised while collecting or sampling is re-raised
        after the subprocess has been killed and reaped.
    """
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename,))
    cat_call = cat_call.finalized_call

    cat = subprocess.Popen(cat_call,
                           bufsize=io.DEFAULT_BUFFER_SIZE,
                           stderr=subprocess.PIPE,
                           stdout=subprocess.PIPE)
    try:
        try:
            qualities = _collect_qualities(cat.stdout, filename)
            sample = sampling.reservoir_sampling(qualities, 100000)
        except:
            # Bare except on purpose: the child must be terminated and
            # reaped for every kind of failure (including interrupts)
            # before the original exception propagates.
            cat.kill()
            cat.wait()
            raise

        rc_cat = cat.wait()
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      "  Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)

        return sample
    finally:
        # Always release the pipe file descriptors; previously these were
        # left open until the Popen object was garbage collected.
        cat.stdout.close()
        cat.stderr.close()
Ejemplo n.º 7
0
def _read_sequences(filename):
    """Read qualities from `filename` through a "cat" subprocess and sample.

    The file is streamed through the command built by ``factory.new("cat")``;
    the process' stdout is handed to ``_collect_qualities`` and the result
    is passed to ``sampling.reservoir_sampling`` with 100000 as the second
    argument.

    Returns:
        Whatever ``sampling.reservoir_sampling`` returns.

    Raises:
        NodeError: if the "cat" process finishes with a non-zero return
            code; the message includes the captured stderr.
        Any exception raised while collecting or sampling is re-raised
        after the subprocess has been killed and reaped.
    """
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename,))
    cat_call = cat_call.finalized_call

    cat = subprocess.Popen(cat_call,
                           bufsize=io.DEFAULT_BUFFER_SIZE,
                           stderr=subprocess.PIPE,
                           stdout=subprocess.PIPE)
    try:
        try:
            qualities = _collect_qualities(cat.stdout, filename)
            sample = sampling.reservoir_sampling(qualities, 100000)
        except:
            # Bare except on purpose: the child must be terminated and
            # reaped for every kind of failure (including interrupts)
            # before the original exception propagates.
            cat.kill()
            cat.wait()
            raise

        rc_cat = cat.wait()
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      "  Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)

        return sample
    finally:
        # Always release the pipe file descriptors; previously these were
        # left open until the Popen object was garbage collected.
        cat.stdout.close()
        cat.stderr.close()
Ejemplo n.º 8
0
    def customize(cls, reference, infile, bedfile, outfile,
                  pileup_only=False, nbatches=1, dependencies=()):
        """Prepare a "genotype" command builder for a BAM file.

        ``infile`` and ``outfile`` are bound to the positional placeholders
        IN_BAMFILE and OUT_VCFFILE. "--bedfile" is only set when ``bedfile``
        is truthy, although IN_INTERVALS is always passed to ``set_kwargs``.
        With ``pileup_only`` the builder additionally gets "--pileup-only"
        and an "-R" mpileup argument. The reference is always forwarded as
        an "-f=<reference>" mpileup argument.

        ``dependencies`` is accepted but not used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        genotyper = factory.new("genotype")
        genotyper.add_value("%(IN_BAMFILE)s")
        genotyper.add_value("%(OUT_VCFFILE)s")
        genotyper.set_option("--nbatches", nbatches)

        if bedfile:
            genotyper.set_option("--bedfile", "%(IN_INTERVALS)s")

        if pileup_only:
            genotyper.set_option("--pileup-only")
            # Read-groups are ignored when only a pileup is produced
            genotyper.add_option("--mpileup-argument", "-R", sep="=")

        # FASTA reference sequence, passed through to mpileup
        fasta_argument = "-f=%s" % (reference,)
        genotyper.add_option("--mpileup-argument", fasta_argument, sep="=")

        genotyper.set_kwargs(
            IN_BAMFILE=infile,
            IN_INTERVALS=bedfile,
            OUT_VCFFILE=outfile,
            CHECK_SAMTOOLS=SAMTOOLS_VERSION,
            CHECK_BCFTOOLS=BCFTOOLS_VERSION,
        )

        result = {"command": genotyper}
        return result
Ejemplo n.º 9
0
def setup_basic_batch(args, regions, prefix, func):
    """Start a "genotype --filter-only" process piped into bgzip.

    A BED file is written via ``write_bed_file(prefix, regions)`` and
    recorded under both "files" and "temp_files". The filter process is
    started with a piped stdout; ``func(setup)`` supplies the stdin of the
    bgzip process, whose stdout is a file opened at ``prefix``.

    Returns the ``setup`` dict with the sections "files", "temp_files",
    "procs" and "handles" filled in.

    On any exception the traceback is written to stderr,
    ``cleanup_batch(setup)`` is called with whatever was registered so far,
    and the exception is re-raised.
    """
    setup = {"files": {}, "temp_files": {}, "procs": {}, "handles": {}}
    files, temp_files = setup["files"], setup["temp_files"]
    procs, handles = setup["procs"], setup["handles"]

    try:
        files["bed"] = write_bed_file(prefix, regions)
        temp_files["bed"] = files["bed"]

        genotyper = factory.new("genotype")
        genotyper.set_option("--filter-only")
        genotyper.set_option("--bedfile", files["bed"])
        genotyper.add_option(args.bamfile)
        genotyper.add_option(args.destination)

        procs["filter"] = processes.open_proc(genotyper.call,
                                              stdout=processes.PIPE,
                                              close_fds=True)

        # Register the handle before starting bgzip, so that cleanup sees it
        handles["outfile"] = open(prefix, "w")
        bgzip_proc = processes.open_proc(["bgzip"],
                                         stdin=func(setup),
                                         stdout=setup["handles"]["outfile"],
                                         close_fds=True)
        procs["gzip"] = bgzip_proc

        return setup
    except:
        # Bare except on purpose: clean up on every kind of failure, then
        # let the original exception propagate
        sys.stderr.write(traceback.format_exc() + "\n")
        cleanup_batch(setup)
        raise
Ejemplo n.º 10
0
def _build_cat_command(input_files, output_file):
    """Return a finalized "cat" command writing all inputs to one output.

    ``output_file`` is bound to the "--output" option via TEMP_OUT_CAT and
    every entry of ``input_files`` is added as a positional value.
    """
    builder = factory.new("cat")
    builder.set_option("--output", "%(TEMP_OUT_CAT)s")
    builder.set_kwargs(**{"TEMP_OUT_CAT": output_file})
    builder.add_multiple_values(input_files)

    command = builder.finalize()
    return command
Ejemplo n.º 11
0
    def __init__(self, config, target_name, input_files, output_file,
                 regions_file=None, dependencies=()):
        """Set up a node running the "depths" tool on one or more BAM files.

        ``input_files`` are wrapped in a MultiBAMInput; its commands are run
        in parallel with the finalized "depths" command, which writes to
        ``output_file``. When ``regions_file`` is given it is passed through
        "--regions-file".

        Raises:
            ValueError: if ``regions_file`` is set while the MultiBAMInput
                reports more than one input file.
        """
        bam_input = MultiBAMInput(config, input_files)
        if len(bam_input.files) > 1 and regions_file:
            message = "DepthHistogram for regions require single, " \
                      "indexed input BAM file."
            raise ValueError(message)

        depths = factory.new("depths")
        for placeholder in ("%(TEMP_IN_BAM)s", "%(OUT_FILE)s"):
            depths.add_value(placeholder)
        depths.set_option("--target-name", target_name)
        depths.set_kwargs(OUT_FILE=output_file)
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(depths)

        if regions_file:
            depths.set_option('--regions-file', '%(IN_REGIONS)s')
            depths.set_kwargs(IN_REGIONS=regions_file)

        all_commands = bam_input.commands + [depths.finalize()]
        command = ParallelCmds(all_commands)
        description = "<DepthHistogram: %s -> '%s'>" % (
            describe_files(bam_input.files),
            output_file,
        )
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=command,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 12
0
    def customize(cls, infile, bedfile, outfile, dependencies=()):
        """Prepare a "sample_pileup" command builder.

        ``infile`` is bound to "--genotype" (IN_PILEUP), ``bedfile`` to
        "--intervals" (IN_INTERVALS), and ``outfile`` receives the command's
        standard output (OUT_STDOUT). ``dependencies`` is accepted but not
        used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        sampler = factory.new("sample_pileup")
        option_templates = (
            ("--genotype", "%(IN_PILEUP)s"),
            ("--intervals", "%(IN_INTERVALS)s"),
        )
        for option, template in option_templates:
            sampler.set_option(option, template)

        sampler.set_kwargs(IN_PILEUP=infile, IN_INTERVALS=bedfile, OUT_STDOUT=outfile)

        return {"command": sampler}
Ejemplo n.º 13
0
    def customize(cls, infile, bedfile, outfile, padding, dependencies=()):
        """Prepare a "vcf_to_fasta" command builder.

        ``infile`` is bound to "--genotype" (IN_VCFFILE) and ``bedfile`` to
        "--intervals" (IN_INTERVALS); ``padding`` is passed to "--padding".
        ``infile + ".tbi"`` is additionally registered as IN_TABIX, and the
        command's standard output goes to ``outfile``. ``dependencies`` is
        accepted but not used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        builder = factory.new("vcf_to_fasta")
        builder.set_option("--padding", padding)
        builder.set_option("--genotype", "%(IN_VCFFILE)s")
        builder.set_option("--intervals", "%(IN_INTERVALS)s")

        tabix_index = infile + ".tbi"
        builder.set_kwargs(
            IN_VCFFILE=infile,
            IN_TABIX=tabix_index,
            IN_INTERVALS=bedfile,
            OUT_STDOUT=outfile,
        )

        return {"command": builder}
Ejemplo n.º 14
0
    def customize(cls, infile, bedfile, outfile, padding, dependencies=()):
        """Prepare a "vcf_to_fasta" command builder.

        ``infile`` is bound to "--genotype" (IN_VCFFILE) and ``bedfile`` to
        "--intervals" (IN_INTERVALS); ``padding`` is passed to "--padding".
        ``infile + ".tbi"`` is additionally registered as IN_TABIX, and the
        command's standard output goes to ``outfile``. ``dependencies`` is
        accepted but not used inside this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        converter = factory.new("vcf_to_fasta")
        converter.set_option("--padding", padding)
        converter.set_option("--genotype", "%(IN_VCFFILE)s")
        converter.set_option("--intervals", "%(IN_INTERVALS)s")

        file_bindings = {
            "IN_VCFFILE": infile,
            "IN_TABIX": infile + ".tbi",
            "IN_INTERVALS": bedfile,
            "OUT_STDOUT": outfile,
        }
        converter.set_kwargs(**file_bindings)

        return {"command": converter}
Ejemplo n.º 15
0
    def customize(cls, pileup, infile, outfile, regions, dependencies=()):
        """Assemble the chained "cat" -> "vcf_filter" -> "bgzip" builders.

        The builders are linked through their IN_STDIN / OUT_STDOUT keyword
        arguments: "cat" reads ``infile`` and writes to a pipe, "vcf_filter"
        reads that pipe (plus ``pileup``) and writes to a pipe, and "bgzip"
        reads the filter output and writes to ``outfile``.

        One "--homozygous-chromosome" option is added for every entry of
        ``regions["HomozygousContigs"]``. ``dependencies`` is accepted but
        not used inside this method.

        Returns a dict with the single key "commands", mapping "cat",
        "filter" and "bgzip" to their builders (none of them finalized here).
        """
        vcf_reader = factory.new("cat")
        vcf_reader.add_value("%(IN_VCF)s")
        vcf_reader.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)

        vcf_filter = factory.new("vcf_filter")
        vcf_filter.add_option("--pileup", "%(IN_PILEUP)s")
        for homozygous_contig in regions["HomozygousContigs"]:
            vcf_filter.add_option("--homozygous-chromosome", homozygous_contig)
        vcf_filter.set_kwargs(IN_PILEUP=pileup, IN_STDIN=vcf_reader, OUT_STDOUT=AtomicCmd.PIPE)

        bgzip_builder = AtomicCmdBuilder(["bgzip"], IN_STDIN=vcf_filter, OUT_STDOUT=outfile)

        commands = {}
        commands["cat"] = vcf_reader
        commands["filter"] = vcf_filter
        commands["bgzip"] = bgzip_builder

        return {"commands": commands}
Ejemplo n.º 16
0
def _build_zip_command(output_format, prefix, name, output=None):
    """Return a finalized "zip" command compressing a named pipe.

    The command reads from ``basename(prefix) + name`` (TEMP_IN_PIPE) and
    writes its standard output to
    ``prefix + (output or name) + "." + output_format``.

    Raises:
        CmdError: if ``output_format`` is neither "gz" nor "bz2".
    """
    supported_formats = ("gz", "bz2")
    if output_format not in supported_formats:
        message = "Invalid output-format (%r), please select 'gz' or 'bz2'"
        raise CmdError(message % (output_format,))

    basename = os.path.basename(prefix)
    builder = factory.new("zip")
    builder.set_option("--format", output_format)
    builder.add_value("%(TEMP_IN_PIPE)s")

    pipe_name = basename + name
    # Fall back to the pipe's own name when no explicit output name is given
    destination = prefix + (output or name) + "." + output_format
    builder.set_kwargs(TEMP_IN_PIPE=pipe_name, OUT_STDOUT=destination)

    return builder.finalize()
Ejemplo n.º 17
0
    def __init__(self, config, target_name, input_file, output_file, regions_file=None, dependencies=()):
        """Set up a node running the "coverage" tool on a single BAM file.

        ``input_file`` and ``output_file`` are bound to the positional
        placeholders IN_BAM and OUT_FILE, ``target_name`` is passed to
        "--target-name", and ``regions_file`` (when truthy) is passed to
        "--regions-file". ``config`` is accepted but not used inside this
        method.
        """
        coverage = factory.new("coverage")
        for placeholder in ("%(IN_BAM)s", "%(OUT_FILE)s"):
            coverage.add_value(placeholder)
        coverage.set_option("--target-name", target_name)
        coverage.set_kwargs(**{"IN_BAM": input_file, "OUT_FILE": output_file})

        if regions_file:
            coverage.set_option("--regions-file", "%(IN_REGIONS)s")
            coverage.set_kwargs(IN_REGIONS=regions_file)

        description = "<Coverage: %s -> '%s'>" % (input_file, output_file)
        CommandNode.__init__(
            self,
            command=coverage.finalize(),
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 18
0
def _build_zip_command(output_format, prefix, name, output=None):
    """Return a finalized "zip" command compressing a named pipe.

    The command reads from ``basename(prefix) + name`` (TEMP_IN_PIPE) and
    writes its standard output to
    ``prefix + (output or name) + "." + output_format``.

    Raises:
        CmdError: if ``output_format`` is neither "gz" nor "bz2".
    """
    if output_format not in ("gz", "bz2"):
        template = "Invalid output-format (%r), please select 'gz' or 'bz2'"
        raise CmdError(template % (output_format,))

    basename = os.path.basename(prefix)
    zipper = factory.new("zip")
    zipper.set_option("--format", output_format)
    zipper.add_value("%(TEMP_IN_PIPE)s")

    file_bindings = {
        "TEMP_IN_PIPE": basename + name,
        # Fall back to the pipe's own name when no output name is given
        "OUT_STDOUT": prefix + (output or name) + "." + output_format,
    }
    zipper.set_kwargs(**file_bindings)

    command = zipper.finalize()
    return command
Ejemplo n.º 19
0
def _process_output(stdin, output_file, reference, run_fixmate=False):
    """Build the "cleanup" command builder consuming ``stdin``.

    The builder reads from ``stdin``, uses ``reference`` for "--fasta",
    writes its standard output to ``output_file`` and uses "bam_cleanup" as
    its temporary prefix. "--paired-ended" is set when ``run_fixmate`` is
    truthy.

    Returns a 2-tuple: the list ``["convert"]`` and a dict mapping
    "convert" to the (unfinalized) builder.
    """
    cleanup = factory.new("cleanup")
    cleanup.set_option("--fasta", "%(IN_FASTA_REF)s")
    cleanup.set_option("--temp-prefix", "%(TEMP_OUT_PREFIX)s")

    file_bindings = {
        "IN_STDIN": stdin,
        "IN_FASTA_REF": reference,
        "OUT_STDOUT": output_file,
        "TEMP_OUT_PREFIX": "bam_cleanup",
        "CHECK_SAMTOOLS": SAMTOOLS_VERSION,
    }
    cleanup.set_kwargs(**file_bindings)

    if run_fixmate:
        cleanup.set_option('--paired-ended')

    keys = ["convert"]
    builders = {"convert": cleanup}
    return keys, builders
Ejemplo n.º 20
0
def _process_output(stdin, output_file, reference, run_fixmate=False):
    """Build the "cleanup" command builder consuming ``stdin``.

    The builder reads from ``stdin``, uses ``reference`` for "--fasta",
    writes its standard output to ``output_file`` and uses "bam_cleanup" as
    its temporary prefix. "--paired-ended" is set when ``run_fixmate`` is
    truthy.

    Returns a 2-tuple: the list ``["convert"]`` and a dict mapping
    "convert" to the (unfinalized) builder.
    """
    builder = factory.new("cleanup")
    option_templates = (
        ("--fasta", "%(IN_FASTA_REF)s"),
        ("--temp-prefix", "%(TEMP_OUT_PREFIX)s"),
    )
    for option, template in option_templates:
        builder.set_option(option, template)

    builder.set_kwargs(
        IN_STDIN=stdin,
        IN_FASTA_REF=reference,
        OUT_STDOUT=output_file,
        TEMP_OUT_PREFIX="bam_cleanup",
        CHECK_SAMTOOLS=SAMTOOLS_VERSION,
    )

    if run_fixmate:
        builder.set_option('--paired-ended')

    return (["convert"], {"convert": builder})
Ejemplo n.º 21
0
    def __init__(self, config, input_files, output_file, dependencies=()):
        """Set up a node running the "duphist" tool on one or more BAM files.

        ``input_files`` are wrapped in a MultiBAMInput whose commands run in
        parallel with the finalized "duphist" command; the tool's standard
        output is written to ``output_file``.
        """
        bam_input = MultiBAMInput(config, input_files)

        builder = factory.new("duphist")
        builder.add_value("%(TEMP_IN_BAM)s")
        builder.set_kwargs(OUT_STDOUT=output_file)
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(builder)
        duphist = builder.finalize()

        commands = ParallelCmds(bam_input.commands + [duphist])

        file_summary = describe_files(input_files)
        description = "<DuplicateHistogram: %s -> %r>" % (file_summary, output_file)
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=commands,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 22
0
    def __init__(self, config, input_files, output_file, dependencies=()):
        """Set up a node running the "duphist" tool on one or more BAM files.

        ``input_files`` are wrapped in a MultiBAMInput whose commands run in
        parallel with the finalized "duphist" command; the tool's standard
        output is written to ``output_file``.
        """
        bam_input = MultiBAMInput(config, input_files)

        duphist_builder = factory.new("duphist")
        duphist_builder.add_value('%(TEMP_IN_BAM)s')
        duphist_builder.set_kwargs(**{"OUT_STDOUT": output_file})
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(duphist_builder)

        all_commands = bam_input.commands + [duphist_builder.finalize()]
        commands = ParallelCmds(all_commands)

        description = "<DuplicateHistogram: %s -> %r>" % (
            describe_files(input_files),
            output_file,
        )
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=commands,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 23
0
    def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
        """Set up a node running "rmdup_collapsed" on one or more BAM files.

        ``input_bams`` are wrapped in a MultiBAMInput whose commands run in
        parallel with the finalized "rmdup_collapsed" command; the tool's
        standard output is written to ``output_bam``. When ``keep_dupes`` is
        falsy, "--remove-duplicates" is passed to the tool.
        """
        bam_input = MultiBAMInput(config, input_bams)

        rmdup = factory.new("rmdup_collapsed")
        rmdup.add_value("%(TEMP_IN_BAM)s")
        rmdup.set_kwargs(**{"OUT_STDOUT": output_bam})
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(rmdup)

        if not keep_dupes:
            rmdup.set_option("--remove-duplicates")

        all_commands = bam_input.commands + [rmdup.finalize()]
        command = ParallelCmds(all_commands)

        file_summary = describe_files(bam_input.files)
        description = "<FilterCollapsedBAM: %s>" % (file_summary,)
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=command,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 24
0
    def __init__(self, config, target_name, input_file, output_file,
                 regions_file=None, dependencies=()):
        """Set up a node running the "coverage" tool on a single BAM file.

        ``input_file`` and ``output_file`` are bound to the positional
        placeholders IN_BAM and OUT_FILE, ``target_name`` is passed to
        "--target-name", and ``regions_file`` (when truthy) is passed to
        "--regions-file". ``config`` is accepted but not used inside this
        method.
        """
        cov_builder = factory.new("coverage")
        cov_builder.add_value("%(IN_BAM)s")
        cov_builder.add_value("%(OUT_FILE)s")
        cov_builder.set_option("--target-name", target_name)
        cov_builder.set_kwargs(IN_BAM=input_file, OUT_FILE=output_file)

        if regions_file:
            cov_builder.set_option('--regions-file', '%(IN_REGIONS)s')
            cov_builder.set_kwargs(**{"IN_REGIONS": regions_file})

        description = "<Coverage: %s -> '%s'>" % (input_file, output_file)
        command = cov_builder.finalize()
        CommandNode.__init__(self, command=command, description=description, dependencies=dependencies)
Ejemplo n.º 25
0
    def customize(cls, reference, infile_bam, infile_vcf, outfile,
                  dependencies=()):
        """Prepare a pileup-only "genotype" command builder.

        ``infile_bam`` and ``outfile`` are bound to the positional
        placeholders IN_BAMFILE and OUT_PILEUP. The intervals are read from
        the temporary file "heterozygous_snps.bed", which is registered both
        as TEMP_IN_INTERVALS and as TEMP_OUT_INTERVALS. The reference is
        forwarded as an "-f=<reference>" mpileup argument.

        ``infile_vcf`` and ``dependencies`` are accepted but not used inside
        this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        intervals_name = "heterozygous_snps.bed"

        builder = factory.new("genotype")
        for placeholder in ("%(IN_BAMFILE)s", "%(OUT_PILEUP)s"):
            builder.add_value(placeholder)
        builder.set_option("--bedfile", "%(TEMP_IN_INTERVALS)s")
        builder.set_option("--pileup-only")
        # Read-groups are ignored when producing a pileup
        builder.add_option("--mpileup-argument", "-R", sep="=")
        # FASTA reference sequence, passed through to mpileup
        reference_argument = "-f=%s" % (reference,)
        builder.add_option("--mpileup-argument", reference_argument, sep="=")

        builder.set_kwargs(
            IN_BAMFILE=infile_bam,
            TEMP_IN_INTERVALS=intervals_name,
            # Registering the file as an output as well makes it get
            # removed automatically
            TEMP_OUT_INTERVALS=intervals_name,
            OUT_PILEUP=outfile,
            CHECK_SAMTOOLS=SAMTOOLS_VERSION,
        )

        return {"command": builder}
Ejemplo n.º 26
0
    def __init__(self, config, target_name, input_files, output_file, regions_file=None, dependencies=()):
        """Set up a node running the "depths" tool on one or more BAM files.

        ``input_files`` are wrapped in a MultiBAMInput; its commands are run
        in parallel with the finalized "depths" command, which writes to
        ``output_file``. When ``regions_file`` is given it is passed through
        "--regions-file".

        Raises:
            ValueError: if ``regions_file`` is set while the MultiBAMInput
                reports more than one input file.
        """
        bam_input = MultiBAMInput(config, input_files)
        if len(bam_input.files) > 1 and regions_file:
            raise ValueError("DepthHistogram for regions require single, indexed input BAM file.")

        depths_builder = factory.new("depths")
        depths_builder.add_value("%(TEMP_IN_BAM)s")
        depths_builder.add_value("%(OUT_FILE)s")
        depths_builder.set_option("--target-name", target_name)
        depths_builder.set_kwargs(**{"OUT_FILE": output_file})
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(depths_builder)

        if regions_file:
            depths_builder.set_option("--regions-file", "%(IN_REGIONS)s")
            depths_builder.set_kwargs(**{"IN_REGIONS": regions_file})

        depths_command = depths_builder.finalize()
        command = ParallelCmds(bam_input.commands + [depths_command])

        file_summary = describe_files(bam_input.files)
        description = "<DepthHistogram: %s -> '%s'>" % (file_summary, output_file)
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=command,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 27
0
    def __init__(self, config, input_bams, output_bam, keep_dupes=True,
                 dependencies=()):
        """Set up a node running "rmdup_collapsed" on one or more BAM files.

        ``input_bams`` are wrapped in a MultiBAMInput whose commands run in
        parallel with the finalized "rmdup_collapsed" command; the tool's
        standard output is written to ``output_bam``. When ``keep_dupes`` is
        falsy, "--remove-duplicates" is passed to the tool.
        """
        bam_input = MultiBAMInput(config, input_bams)

        dedup_builder = factory.new("rmdup_collapsed")
        dedup_builder.add_value("%(TEMP_IN_BAM)s")
        dedup_builder.set_kwargs(OUT_STDOUT=output_bam)
        # NOTE(review): presumably binds the TEMP_IN_BAM placeholder; confirm
        # against MultiBAMInput.setup
        bam_input.setup(dedup_builder)

        if not keep_dupes:
            dedup_builder.set_option("--remove-duplicates")

        dedup_command = dedup_builder.finalize()
        command = ParallelCmds(bam_input.commands + [dedup_command])
        description = "<FilterCollapsedBAM: %s>" % (
            describe_files(bam_input.files),
        )
        MultiBAMInputNode.__init__(
            self,
            bam_input=bam_input,
            command=command,
            description=description,
            dependencies=dependencies,
        )
Ejemplo n.º 28
0
    def customize(cls, reference, infile_bam, infile_vcf, outfile, dependencies=()):
        """Prepare a pileup-only "genotype" command builder.

        ``infile_bam`` and ``outfile`` are bound to the positional
        placeholders IN_BAMFILE and OUT_PILEUP. The intervals are read from
        the temporary file "heterozygous_snps.bed", which is registered both
        as TEMP_IN_INTERVALS and as TEMP_OUT_INTERVALS. The reference is
        forwarded as an "-f=<reference>" mpileup argument.

        ``infile_vcf`` and ``dependencies`` are accepted but not used inside
        this method.

        Returns a dict with the single key "command" holding the builder,
        which is not finalized here.
        """
        genotyper = factory.new("genotype")
        genotyper.add_value("%(IN_BAMFILE)s")
        genotyper.add_value("%(OUT_PILEUP)s")
        genotyper.set_option("--bedfile", "%(TEMP_IN_INTERVALS)s")
        genotyper.set_option("--pileup-only")
        # Read-groups are ignored when producing a pileup
        genotyper.add_option("--mpileup-argument", "-R", sep="=")
        # FASTA reference sequence, passed through to mpileup
        fasta_argument = "-f=%s" % (reference,)
        genotyper.add_option("--mpileup-argument", fasta_argument, sep="=")

        file_bindings = {
            "IN_BAMFILE": infile_bam,
            "TEMP_IN_INTERVALS": "heterozygous_snps.bed",
            # Registering the file as an output as well makes it get
            # removed automatically
            "TEMP_OUT_INTERVALS": "heterozygous_snps.bed",
            "OUT_PILEUP": outfile,
            "CHECK_SAMTOOLS": SAMTOOLS_VERSION,
        }
        genotyper.set_kwargs(**file_bindings)

        return {"command": genotyper}
Ejemplo n.º 29
0
def _build_cat_command(input_file, output_file):
    """Return an unfinalized "cat" builder copying one input to one output.

    ``input_file`` is bound to the positional IN_ARCHIVE placeholder and
    ``output_file`` to the "--output" option via TEMP_OUT_CAT.
    """
    cat_builder = factory.new("cat")
    cat_builder.set_option("--output", "%(TEMP_OUT_CAT)s")
    cat_builder.add_value("%(IN_ARCHIVE)s")
    cat_builder.set_kwargs(
        TEMP_OUT_CAT=output_file,
        IN_ARCHIVE=input_file,
    )

    return cat_builder