Ejemplo n.º 1
0
    def run(self, infiles, outfile, params):
        """Combine the records of several compressed VCF files into one file.

        The shell statement built here

        1. writes the header of the first input file (``bcftools view -h``,
           cut to the first ten columns) to ``outfile``,
        2. streams all input files through ``zcat``, drops header lines and
           records whose fifth column is ``<NON_REF>`` and overwrites
           columns 6 to 10 with placeholder values,
        3. sorts by column 1 (version sort) and column 2 (numeric), removes
           adjacent duplicate lines and appends the result to ``outfile``,
        4. indexes ``outfile`` with ``tabix -p vcf``.

        Args:
            infiles: sequence of input file names; the header is taken from
                the first one.
            outfile: name of the bgzip-compressed output file.
            params: object providing ``job_threads``.

        Returns:
            The value returned by ``P.run``. It used to be stored in a local
            and dropped; it is now returned, as the sibling runners do.
        """
        files = " ".join(infiles)

        # Used for ``sort --parallel`` below.
        job_threads = params.job_threads

        # todo:
        # 1. add header.
        # 2. do batch+merge sort in order to avoid hitting temporary space limits.
        # 3. remove unnecessary info fields while sorting, add them later.

        tmpdir = P.get_temp_filename()
        return P.run(
            "mkdir {tmpdir}; "
            "bcftools view -h {infiles[0]} "
            "| cut -f 1-10 "
            "| bgzip > {outfile}; "
            "zcat {files} "
            "| awk -v OFS='\\t' "
            "'!/^#/ && $5 != \"<NON_REF>\" "
            "{{$8=\".\";$9=\".\";$6=\".\";$7=\"GT\";$10=\".\"; print}}' "
            "2> {outfile}.filter.log "
            "| sort -k1,1V -k2,2n "
            "--parallel {job_threads} "
            "-T {tmpdir} "
            "2> {outfile}.sort.log "
            "| uniq "
            "| bgzip "
            ">> {outfile}; "
            "tabix -p vcf {outfile}; "
            "rm -rf {tmpdir} ".format(**locals()))
Ejemplo n.º 2
0
    def run(self, infile, outfile, params):
        """Intersect a test BED file with a reference BED file and count hits.

        Both files are first passed through ``standardise_bed_files``. Three
        bgzip-compressed files are then written next to ``outfile``:
        ``.shared.bed.gz`` (test intervals overlapping the reference),
        ``.unique_test.bed.gz`` (test intervals without overlap) and
        ``.unique_truth.bed.gz`` (reference intervals without overlap).
        One file per entry of ``self.sections`` is tabix-indexed and its
        records are counted; the counts are written to ``outfile`` as a
        two-column table.

        Args:
            infile: test BED file.
            outfile: name of the count table; also the prefix of the BED
                files produced.
            params: object providing ``reference_bed`` and ``path``.

        Returns:
            The value returned by ``P.run``.

        Raises:
            ValueError: if ``params.reference_bed`` is None.
        """
        if params.reference_bed is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # requires a consistent sort order, so sort both files.
        # It also requires the chromosome content to be identical,
        # so restrict output to common sets.
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        statements = [stmnt]
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa "
                          "| bgzip "
                          "> {outfile}.shared.bed.gz")
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_test.bed.gz")
        statements.append("{params.path} intersect "
                          "-b {tmpf_test} "
                          "-a {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_truth.bed.gz")
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        for section in self.sections:
            statements.append(
                "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

        statement = "; ".join(statements)
        retval = P.run(statement.format(**locals()))

        # these are small files, so doing it here. Implement tabix.count()
        # method
        counts = dict()
        for section in self.sections:
            inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
            try:
                # Count while streaming instead of building a list, and make
                # sure the handle is released even if fetch() fails.
                counts[section] = sum(1 for _ in inf.fetch())
            finally:
                inf.close()

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("section\tcounts\n")
            outf.write("\n".join(
                "\t".join(map(str, x)) for x in counts.items()) + "\n")

        return retval
Ejemplo n.º 3
0
    def run(self, outfile, params):
        """Map reads with the aligner at ``self.path`` into a sorted BAM file.

        The aligner output is piped through ``samtools view`` and
        ``samtools sort`` into ``outfile``, which is then indexed.

        Args:
            outfile: name of the BAM file to create; log files are written
                next to it.
            params: named-tuple-like object providing ``options``,
                ``fastq``, ``reference_fasta`` (or ``index``),
                ``set_readgroup`` and ``readgroup_id_regex``.

        Returns:
            The value returned by ``P.run``.
        """
        # job_threads is not referenced in the statement below; presumably
        # P.run reads it from this frame to size the job -- TODO confirm.
        # NOTE(review): the --rg-id/-x/-U options look like bowtie2, where the
        # thread flag is -p and -t means --time -- confirm which aligner
        # self.path is and whether -p should be recognised here.
        if "--threads" in params.options or "-t " in params.options:
            threads_match = re.search(r"(-t|--threads)\s*(\d+)",
                                      params.options)
            # The substring test can succeed without a number following the
            # flag; only override the thread count when one is present.
            if threads_match is not None:
                job_threads = int(threads_match.group(2))

        fastq = resolve_argument(params.fastq, ",").split(",")
        if len(fastq) == 1:
            # Single-end: format the file name, not the list holding it.
            fastq = '-U "{}"'.format(fastq[0])
        else:
            fastq = '-1 "{}" -2 "{}"'.format(*fastq)

        tmpdir = P.get_temp_filename(clear=True)

        # An explicit index takes precedence over the reference sequence.
        if "index" in params._fields:
            index = params.index
        else:
            index = params.reference_fasta

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            readgroup_option = "--rg-id {}".format(readgroup_id)

            # Drop the leading "@RG<tab>ID:...<tab>" and pass every remaining
            # tab-separated field as its own --rg option.
            readgroup_string = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
            readgroup_string = " ".join(["--rg {}".format(x)
                                         for x in readgroup_string.split("\t")])
        else:
            readgroup_option = ""
            readgroup_string = ""

        return P.run(
            "mkdir {tmpdir}; "
            "{self.path} "
            "{readgroup_option} "
            "{readgroup_string} "
            "{params.options} "
            "-x {index} "
            "{fastq} "
            "2> {outfile}.log "
            "| samtools view -b /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort -T {tmpdir} -O bam /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile}; "
            "rm -rf {tmpdir}".format(**locals()),
            **params._asdict())
Ejemplo n.º 4
0
    def run(self, infile, outfile, params):
        """Run the bedtools intersection metric, then annotate its sections.

        After ``run_metric_bedtools_intersection.run`` has finished, the
        count table is moved aside, each per-section BED file gets its
        fourth column replaced by the section name, and the relabelled
        files are handed to the tool at ``params.gat_path`` as segment
        files.

        Args:
            infile: input file passed on to the intersection metric.
            outfile: output name; used as prefix for the result tables and
                as the log file of the enrichment tool.
            params: object providing ``annotations_bed``, ``workspace_bed``,
                ``gat_path`` and ``options``.

        Returns:
            A list with the return values of the intersection run and of
            the ``P.run`` call issued here.

        Raises:
            ValueError: if ``annotations_bed`` or ``workspace_bed`` is None.
        """
        for required in ("annotations_bed", "workspace_bed"):
            if getattr(params, required) is None:
                raise ValueError("{} requires {} to be set".format(
                    self.name, required))

        retvals = [
            run_metric_bedtools_intersection.run(self, infile, outfile,
                                                 params)]

        commands = [
            "mv {outfile} {outfile}.bedtools_intersect_and_annotate_counts.tsv"
            .format(outfile=outfile)
        ]

        # One relabelled temporary BED file per section.
        labelled_beds = []
        for section in self.sections:
            labelled = "-".join(
                (P.get_temp_filename(clear=True), section)) + ".gz"
            commands.append(
                "zcat {outfile}.{section}.bed.gz "
                "| awk -v OFS='\\t' '{{ $4 = \"{section}\"; print }}' "
                "| bgzip > {tmpf}".format(
                    outfile=outfile, section=section, tmpf=labelled))
            labelled_beds.append(labelled)

        commands.append(
            "{params.gat_path} "
            "{segment_files} "
            "--with-segment-tracks "
            "--annotation-bed-file={params.annotations_bed} "
            "--workspace-bed-file={params.workspace_bed} "
            "--log={outfile} "
            "{params.options} "
            "> {outfile}.bedtools_intersect_and_annotate_enrichment.tsv".
            format(
                params=params,
                outfile=outfile,
                segment_files=" ".join(
                    "--segment-bed-file={}".format(bed)
                    for bed in labelled_beds)))

        # Clean up the relabelled files once the enrichment has run.
        commands.extend("rm -f {}".format(bed) for bed in labelled_beds)

        retvals.append(P.run("; ".join(commands)))
        return retvals
Ejemplo n.º 5
0
    def run(self, outfile, params):
        """Map reads with the tool at ``params.path`` and add read groups.

        Reads are truncated to 5999 characters per line, mapped into a
        temporary BAM file, sorted with samtools, passed through picard
        ``AddOrReplaceReadGroups`` to create ``outfile`` and indexed.

        Args:
            outfile: name of the BAM file to create; log files are written
                next to it.
            params: object providing ``options``, ``fastq``, ``path``,
                ``reference_fasta``, ``path_picard``, ``library``,
                ``platform`` and ``sample``.

        Returns:
            The value returned by ``P.run``.

        Raises:
            ValueError: if ``params.options`` does not contain an explicit
                numeric ``threads=<n>`` setting.
        """
        # the default is auto so use ten threads.
        if "threads" not in params.options:
            raise ValueError("please specify the number of threads to use")

        threads_match = re.search(r"threads=(\d+)", params.options)
        # "threads=auto" (or any other non-numeric value) has no digits to
        # extract; report it as a configuration error instead of failing on
        # a missing regex match.
        if "threads=auto" in params.options or threads_match is None:
            raise ValueError(
                "please specify the number of threads "
                "to use explicitely")
        # job_threads/job_memory are not referenced in the statement below;
        # presumably P.run reads them from this frame -- TODO confirm.
        job_threads = int(threads_match.group(1))

        job_memory = "32G"

        fastq = resolve_argument(params.fastq, " ")

        tmpdir = P.get_temp_filename(clear=True)

        return P.run(
            "mkdir {tmpdir}; "
            "zcat {fastq} "
            "| cut -c -5999 "
            "| gzip > {tmpdir}/in.fastq.gz; "
            "{params.path} "
            "{params.options} "
            "in={tmpdir}/in.fastq.gz "
            "ref={params.reference_fasta} "
            "out={tmpdir}/result.bam "
            ">& {outfile}.log; "
            "samtools sort -o {tmpdir}/sorted.bam {tmpdir}/result.bam; "
            "java -Xmx8000m -jar {params.path_picard} "
            "AddOrReplaceReadGroups "
            "INPUT={tmpdir}/sorted.bam "
            "OUTPUT={outfile} "
            "VALIDATION_STRINGENCY=LENIENT "
            "RGID=1 "
            "RGLB={params.library} "
            "RGPL={params.platform} "
            "RGPU=unknown "
            "RGSM={params.sample} "
            ">& {outfile}.picard.log; "
            "samtools index {outfile} "
            ">& {outfile}.index.log; "
            "rm -rf {tmpdir}".format(**locals()))
Ejemplo n.º 6
0
    def run(self, outfile, params):
        """Map reads with ``{self.path} mem`` into a sorted, indexed BAM file.

        Args:
            outfile: name of the BAM file to create; log files are written
                next to it.
            params: named-tuple-like object providing ``options``,
                ``fastq``, ``reference_fasta``, ``set_readgroup`` and
                ``readgroup_id_regex``.

        Returns:
            The value returned by ``P.run``.
        """
        # pipes.quote is an alias of shlex.quote on Python 3 and the pipes
        # module no longer exists from Python 3.13 on.
        try:
            from shlex import quote as shell_quote
        except ImportError:  # Python 2
            from pipes import quote as shell_quote

        job_threads = 1
        if "-t" in params.options:
            threads_match = re.search(r"-t\s*(\d+)", params.options)
            # "-t" may be present as a substring without a number after it;
            # keep the default of one thread in that case.
            if threads_match is not None:
                job_threads = int(threads_match.group(1))

        # BWA requires at least 6Gb of memory, but is also correlated
        # with the number of threads, so use 5Gb + 1Gb per thread
        job_memory = "{}G".format(5.0 + 1.0 * job_threads)

        # Quote every file name individually.
        fastq = resolve_argument(params.fastq, ",")
        fastq = '"{}"'.format('" "'.join(fastq.split(",")))

        tmpdir = P.get_temp_filename(clear=True)

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            readgroup_option = "-R {}".format(shell_quote(readgroup_string))
            # add additional level of quoting: turn each tab character into
            # the two characters backslash and "t".
            readgroup_option = readgroup_option.replace("\t", "\\t")
        else:
            readgroup_option = ""

        return P.run(
            "mkdir {tmpdir}; "
            "{self.path} mem "
            "{readgroup_option} "
            "{params.options} "
            "{params.reference_fasta} "
            "{fastq} "
            "2> {outfile}.log "
            "| samtools view -bu /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort --threads {job_threads} -T {tmpdir} -O bam /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile} >& {outfile}.index.log; "
            "rm -rf {tmpdir}".format(**locals()),
            **params._asdict())
Ejemplo n.º 7
0
    def run(self, infile, outfile, params):
        """Run the tool at ``params.path`` on standardised BED files.

        ``infile`` and ``params.annotations_bed`` are passed through
        ``standardise_bed_files``; the results are used as segment and
        annotation files. Standard output goes to ``outfile`` and the tool's
        log to ``outfile + ".log"``.

        Args:
            infile: BED file used as segments.
            outfile: name of the result file.
            params: object providing ``annotations_bed``, ``workspace_bed``,
                ``path`` and ``options``.

        Returns:
            The value returned by ``P.run``.
        """
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.annotations_bed)
        statements = [stmnt]
        statements.append("{params.path} "
                          "--segment-bed-file={tmpf_test} "
                          "--ignore-segment-tracks "
                          "--annotation-bed-file={tmpf_truth} "
                          "--workspace-bed-file={params.workspace_bed} "
                          "--log={outfile}.log "
                          "{params.options} "
                          "> {outfile}")
        # Remove the standardised temporary files, as the sibling runners
        # that use standardise_bed_files do; they were left behind before.
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        statement = "; ".join(statements)
        return P.run(statement.format(**locals()))
Ejemplo n.º 8
0
    def run(self, infile, outfiles, params):
        """Split a test BED file into shared and unique intervals.

        ``infile`` and ``params.reference_bed`` are passed through
        ``standardise_bed_files`` and intersected three ways with the tool
        at ``params.path``. All three outputs are bgzip-compressed and
        tabix-indexed.

        Args:
            infile: test BED file.
            outfiles: three output names, in the order shared intervals,
                intervals unique to the test file, intervals unique to the
                reference file.
            params: object providing ``reference_bed`` and ``path``.

        Returns:
            The value returned by ``P.run``.
        """
        # Both inputs need the same sort order and the same set of
        # chromosomes, hence the standardisation step below.
        tmpf = P.get_temp_filename(clear=True)

        outfile_shared, outfile_test, outfile_truth = outfiles

        tmpf_test = tmpf + "_a.bed.gz"
        tmpf_truth = tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        statements = [
            stmnt,
            # test intervals overlapping the reference
            "{params.path} intersect "
            "-a {tmpf_test} "
            "-b {tmpf_truth} "
            "-wa "
            "| bgzip "
            "> {outfile_shared} ",
            # test intervals without overlap
            "{params.path} intersect "
            "-a {tmpf_test} "
            "-b {tmpf_truth} "
            "-wa -v"
            "| bgzip "
            "> {outfile_test}",
            # reference intervals without overlap
            "{params.path} intersect "
            "-b {tmpf_test} "
            "-a {tmpf_truth} "
            "-wa -v"
            "| bgzip "
            "> {outfile_truth}",
            "rm -f {tmpf_test} {tmpf_truth}",
        ]
        statements += ["tabix -f -p bed {}".format(name) for name in outfiles]

        # Placeholders are filled in only now, from the local variables.
        return P.run("; ".join(statements).format(**locals()))
    def run(self, outfile, params):
        """Call variants with ``{params.path} callVariants``.

        The caller writes to a temporary file which is then bgzip-compressed,
        tabix-indexed and moved, together with its index, to ``outfile``.

        Args:
            outfile: name of the compressed VCF file to create.
            params: object providing ``options``, ``bam`` and ``path``;
                also passed to ``get_reference``.

        Returns:
            The value returned by ``P.run``.
        """
        # job_threads is not referenced in the statement below; presumably
        # P.run reads it from this frame -- TODO confirm.
        if "--nCPU" in params.options:
            # Accept "--nCPU 4" as well as "--nCPU=4" and do not fail when no
            # number follows the flag.
            ncpu_match = re.search(r"--nCPU[=\s]*(\d+)", params.options)
            if ncpu_match is not None:
                job_threads = int(ncpu_match.group(1))

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        tmpfile = P.get_temp_filename(clear=True)

        return P.run("{params.path} callVariants "
                     "--bamFiles {bam} "
                     "--refFile {reference_fasta} "
                     "--output {tmpfile} "
                     "{params.options} "
                     ">& {outfile}.log; "
                     "bgzip {tmpfile}; "
                     "tabix -p vcf {tmpfile}.gz; "
                     "mv {tmpfile}.gz  {outfile}; "
                     "mv {tmpfile}.gz.tbi {outfile}.tbi; ".format(**locals()))
Ejemplo n.º 10
0
    def run(self, infile, outfile, params):
        """Run ``{params.path} jaccard`` on a test and a reference BED file.

        Both files are first passed through ``standardise_bed_files``. The
        result is appended to ``outfile``, messages on standard error go to
        ``outfile + ".log"`` and the temporary files are removed afterwards.

        Args:
            infile: test BED file.
            outfile: file the result is appended to.
            params: object providing ``reference_bed``, ``path`` and
                ``options``.

        Returns:
            The value returned by ``P.run``.

        Raises:
            ValueError: if ``params.reference_bed`` is None.
        """
        reference = params.reference_bed
        if reference is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # jaccard needs both inputs in the same sort order, so standardise
        # them into two temporary files first.
        prefix = P.get_temp_filename(clear=True)
        sorted_test = prefix + "_a.bed.gz"
        sorted_reference = prefix + "_b.bed.gz"
        preparation = standardise_bed_files(sorted_test, sorted_reference,
                                            infile, reference)

        template = ("{stmnt}; "
                    "{params.path} jaccard "
                    "-a {tmpf1} -b {tmpf2} "
                    "{params.options} "
                    "2> {outfile}.log "
                    ">> {outfile}; "
                    "rm -f {tmpf1} {tmpf2}")

        return P.run(template.format(stmnt=preparation,
                                     params=params,
                                     tmpf1=sorted_test,
                                     tmpf2=sorted_reference,
                                     outfile=outfile))
Ejemplo n.º 11
0
    def run(self, outfile, params):
        """Map reads with the tool at ``params.path`` and add read groups.

        The mapper writes SAM into a temporary directory; the result is
        converted to BAM, sorted, passed through picard
        ``AddOrReplaceReadGroups`` to create ``outfile`` and indexed.

        Args:
            outfile: name of the BAM file to create; log files are written
                next to it.
            params: object providing ``options``, ``fastq``, ``path``,
                ``reference_fasta``, ``path_picard``, ``library``,
                ``platform`` and ``sample``.

        Returns:
            The value returned by ``P.run``.
        """
        # job_threads/job_memory are not referenced in the statement below;
        # presumably P.run reads them from this frame -- TODO confirm.
        if "-t" in params.options:
            threads_match = re.search(r"-t\s*(\d+)", params.options)
            # "-t" may occur as a substring without a number after it; leave
            # job_threads unset in that case instead of failing.
            if threads_match is not None:
                job_threads = int(threads_match.group(1))

        job_memory = "32G"

        fastq = resolve_argument(params.fastq, " ")

        tmpdir = P.get_temp_filename(clear=True)

        return P.run(
            "mkdir {tmpdir}; "
            "{params.path} "
            "{params.options} "
            "-r {params.reference_fasta} "
            "-d {fastq} "
            "-o {tmpdir}/result.sam "
            ">& {outfile}.log; "
            "samtools view -bS {tmpdir}/result.sam "
            "| samtools sort -o {tmpdir}/sorted.bam -; "
            "java -Xmx8000m -jar {params.path_picard} "
            "AddOrReplaceReadGroups "
            "INPUT={tmpdir}/sorted.bam "
            "OUTPUT={outfile} "
            "VALIDATION_STRINGENCY=LENIENT "
            "RGID=1 "
            "RGLB={params.library} "
            "RGPL={params.platform} "
            "RGPU=unknown "
            "RGSM={params.sample} "
            ">& {outfile}.picard.log; "
            "samtools index {outfile} "
            ">& {outfile}.index.log; "
            "rm -rf {tmpdir}".format(**locals()))
Ejemplo n.º 12
0
    def run(self, outfile, params):
        """Annotate a VCF file with assessment codes from two comparisons.

        For each of FP, FN and TP the "test" and "compare" call sets are
        intersected with ``{params.path} isec``. The per-label result files
        are converted into BED intervals labelled ``<FP|FN|TP><U|O|S>``,
        merged, and used by ``bcftools annotate`` to add an ``AS`` INFO
        field to the target VCF. The statement is executed through
        ``self.run_with_preprocessing``.

        Args:
            outfile: name of the annotated, compressed VCF file. A header
                file and the merged annotation BED file are written to its
                directory.
            params: object providing ``path`` and the mapping ``vcf`` with
                the keys ``target``, ``test`` and ``compare`` (the latter
                two holding ``fp``, ``fn`` and ``tp``).

        Returns:
            The value returned by ``self.run_with_preprocessing``.

        Raises:
            ValueError: if one of the expected keys is missing in
                ``params.vcf``.
        """
        categories = ("fp", "fn", "tp")
        try:
            vcfs = params.vcf
            vcf_target = vcfs["target"]
            test_sets = [vcfs["test"][kind] for kind in categories]
            comp_sets = [vcfs["compare"][kind] for kind in categories]
        except KeyError as msg:
            raise ValueError("missing input data: {}".format(msg))

        tmpdir = P.get_temp_filename(clear=True)

        outdir = os.path.dirname(outfile)
        bedfile = os.path.join(tmpdir, "annotations.bed.gz")
        bedfile_sorted = os.path.join(outdir, "annotations.bed.gz")

        # The header line declaring the new INFO field is written right
        # away, not as part of the shell statement.
        header = os.path.join(outdir, "header.txt")
        header_line = (
            '##INFO=<ID=AS,Number=.,Type=String,'
            'Description="Assessment code. Combination of FP/FN/TP and '
            'U for unique, O for other and S for shared.">')
        with open(header, "w") as outf:
            outf.write(header_line)

        # File written by isec for each code: unique, other, shared.
        isec_outputs = (("0000.vcf.gz", "U"),
                        ("0001.vcf.gz", "O"),
                        ("0002.vcf.gz", "S"))

        commands = ["mkdir {tmpdir}".format(tmpdir=tmpdir)]
        candidates = []
        for a, b, label in zip(test_sets, comp_sets, ("FP", "FN", "TP")):
            commands.append(
                "{params.path} isec "
                "--output-type z "
                "--prefix {tmpdir}/{label} "
                "{a} {b}"
                "&> {outfile}.isec_{label}.log ".format(
                    params=params, tmpdir=tmpdir, label=label,
                    a=a, b=b, outfile=outfile))
            for filename, code in isec_outputs:
                candidates.append(
                    (os.path.join(tmpdir, label, filename), label + code))

        # TPO = FNU
        # TPU = FNO
        redundant = ("TPO", "TPU")
        # files to keep, these are variants that will be not in the vcf
        # file that is being annotated.
        keep = ("FNS", "FNU", "FNO")

        for f, label in candidates:
            if label in redundant:
                continue
            commands.append(
                "zcat {f} "
                "| awk '!/^#/ "
                "{{printf(\"%%s\\t%%i\\t%%i\\t{label}\\n\", $1, $2-1, $2) }}'"
                "| bgzip "
                ">> {bedfile} ".format(f=f, label=label, bedfile=bedfile))
            if label in keep:
                commands.append(
                    "cp {f} {outfile}.{label}.vcf.gz".format(
                        f=f, outfile=outfile, label=label))

        commands.append(
            "zcat {bedfile} "
            "| sort -k1,1 -k2,2n "
            "| bedtools merge -i stdin -c 4 -o distinct -delim ',' "
            "2> {bedfile_sorted}.log "
            "| bgzip "
            "> {bedfile_sorted}".format(
                bedfile=bedfile, bedfile_sorted=bedfile_sorted))
        commands.append(
            "tabix -p bed {bedfile_sorted}".format(
                bedfile_sorted=bedfile_sorted))

        commands.append("bcftools annotate "
                        "--annotations={bedfile_sorted} "
                        "--columns=CHROM,FROM,TO,AS "
                        "--header-lines {header} "
                        "--output-type z "
                        "{vcf_target} "
                        "2> {outfile}.log "
                        "> {outfile}; "
                        "tabix -p vcf {outfile} ".format(
                            bedfile_sorted=bedfile_sorted, header=header,
                            vcf_target=vcf_target, outfile=outfile))

        commands.append("rm -rf {tmpdir}".format(tmpdir=tmpdir))

        return self.run_with_preprocessing(vcf_target, outfile, params,
                                           "; ".join(commands))
Ejemplo n.º 13
0
    def run(self, outfile, params):
        """Extract reads from an SRA file and map them into a sorted CRAM.

        Reads are extracted with ``fastq-dump`` and mapped with
        ``{self.path} mem``; the output is converted to CRAM, sorted into
        ``outfile`` and indexed. A non-local SRA file is copied into the
        working directory first.

        Args:
            outfile: name of the CRAM file to create; it is made absolute
                because the statement changes directory. Log files are
                written next to it.
            params: object providing ``options``, ``sra``,
                ``reference_fasta``, ``cram_fasta``, ``set_readgroup``,
                ``readgroup_id_regex`` and ``extract_to_temp``.

        Returns:
            The value returned by ``P.run``.
        """
        # pipes.quote is an alias of shlex.quote on Python 3 and the pipes
        # module no longer exists from Python 3.13 on.
        try:
            from shlex import quote as shell_quote
        except ImportError:  # Python 2
            from pipes import quote as shell_quote

        min_job_memory = 3
        job_threads = 1
        if "-t" in params.options:
            threads_match = re.search(r"-t\s*(\d+)", params.options)
            # "-t" may occur as a substring without a number after it; keep
            # the default of one thread in that case.
            if threads_match is not None:
                job_threads = int(threads_match.group(1))

        # (min_job_memory + 1G per thread), expressed per thread.
        job_memory = "{}G".format(
            float(min_job_memory + 1.0 * job_threads) / job_threads)

        # Fall back to the mapping reference when no CRAM reference is set.
        cram_fasta = params.cram_fasta
        if cram_fasta is None:
            cram_fasta = params.reference_fasta

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            readgroup_option = "-R {}".format(shell_quote(readgroup_string))
            # add additional level of quoting: turn each tab character into
            # the two characters backslash and "t".
            readgroup_option = readgroup_option.replace("\t", "\\t")
        else:
            readgroup_option = ""

        fastq = " ".join(sra_peek(params.sra))
        outfile = os.path.abspath(outfile)

        if params.extract_to_temp:
            tmpdir = P.get_temp_filename(clear=True)
            tmpdir_pre = "mkdir {};".format(tmpdir)
            tmpdir_post = "rm -rf {}".format(tmpdir)
        else:
            tmpdir = os.path.dirname(outfile)
            tmpdir_pre = ""
            tmpdir_post = ""

        # AH: fastq-dump hangs with arv mounts, thus try copying first
        if not IOTools.is_local(params.sra):
            E.warn("copying file {} to temporary directory".format(params.sra))
            temp_sra = os.path.join(
                tmpdir, os.path.basename(params.sra))
            fastq_dump = (
                "cp {params.sra}* {tmpdir}; "
                "fastq-dump --split-files --gzip {temp_sra} >& {outfile}.dump.log ".format(
                    **locals()))
            tmpdir_post = "rm -f {}*; {}".format(
                temp_sra, tmpdir_post)
        else:
            # str.format does not expand placeholders inside substituted
            # values, so this fragment has to be formatted here; otherwise
            # the literal "{params.sra}" would end up in the shell command.
            fastq_dump = (
                "fastq-dump --split-files --gzip {params.sra} >& {outfile}.dump.log "
                .format(**locals()))

        return P.run(
            "{tmpdir_pre} "
            "cd {tmpdir}; "
            "{fastq_dump}; "
            "{self.path} mem -v 3 "
            "{readgroup_option} "
            "{params.options} "
            "{params.reference_fasta} "
            "{fastq} "
            "2> {outfile}.map.log "
            "| samtools view -O cram --reference {cram_fasta} /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort -T {tmpdir} -O cram /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile} >& {outfile}.index.log; "
            "{tmpdir_post}".format(**locals()))