def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        A Manta_1_5_0 step and a StrelkaGermline_2_9_10 step are registered on
        the same BAM and reference.  The Strelka variants are then passed
        through SplitMultiAllele and VcfToolsvcftoolsLatest.  The Manta
        diploid SVs, the raw Strelka variants and the filtered VCF are
        exposed as workflow outputs.
        """
        # Mandatory inputs.
        self.input("bam", BamBai)
        self.input("reference", FastaWithDict)

        # Optional inputs, forwarded to the callers below.
        self.input("intervals", BedTabix(optional=True))
        self.input("is_exome", Boolean(optional=True))
        self.input("manta_config", File(optional=True))
        self.input("strelka_config", File(optional=True))

        manta_tool = Manta_1_5_0(
            bam=self.bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
            config=self.manta_config,
        )
        self.step("manta", manta_tool)

        strelka_tool = StrelkaGermline_2_9_10(
            bam=self.bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
            config=self.strelka_config,
        )
        self.step("strelka", strelka_tool)

        # Normalise the Strelka calls, then keep only "PASS" variants.
        split_tool = SplitMultiAllele(
            vcf=self.strelka.variants.as_type(CompressedVcf),
            reference=self.reference,
        )
        self.step("splitnormalisevcf", split_tool)

        pass_filter_tool = VcfToolsvcftoolsLatest(
            vcf=self.splitnormalisevcf.out,
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        # Workflow outputs.
        manta_sv = self.manta.diploidSV
        self.output("sv", source=manta_sv)
        raw_variants = self.strelka.variants
        self.output("variants", source=raw_variants)
        filtered_variants = self.filterpass.out
        self.output("out", source=filtered_variants)
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        Manta runs on the normal/tumor pair and its candidate small indels
        are handed to Strelka.  The Strelka SNV and indel outputs are each
        passed through BcfToolsNorm and then BcfToolsIndex.
        """
        # Alignment inputs.
        self.input("normalBam", CramCrai)
        self.input("tumorBam", CramCrai)

        # Reference and optional caller settings.
        self.input("reference", FastaFai)
        self.input("callRegions", BedTabix(optional=True))
        self.input("exome", Boolean(optional=True), default=False)
        self.input("configStrelka", File(optional=True))

        manta_tool = Manta(
            bam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
        )
        self.step("manta", manta_tool)

        strelka_tool = Strelka(
            indelCandidates=self.manta.candidateSmallIndels,
            normalBam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
            config=self.configStrelka,
        )
        self.step("strelka", strelka_tool)

        # SNVs: normalise, then index.
        snv_norm_tool = BcfToolsNorm(
            vcf=self.strelka.snvs, reference=self.reference
        )
        self.step("normaliseSNVs", snv_norm_tool)
        snv_index_tool = BcfToolsIndex(vcf=self.normaliseSNVs.out)
        self.step("indexSNVs", snv_index_tool)

        # Indels: normalise, then index.
        indel_norm_tool = BcfToolsNorm(
            vcf=self.strelka.indels, reference=self.reference
        )
        self.step("normaliseINDELs", indel_norm_tool)
        indel_index_tool = BcfToolsIndex(vcf=self.normaliseINDELs.out)
        self.step("indexINDELs", indel_index_tool)

        # Workflow outputs.
        diploid_sv = self.manta.diploidSV
        self.output("diploid", source=diploid_sv)
        candidate_indels = self.manta.candidateSmallIndels
        self.output("candIndels", source=candidate_indels)
        indexed_indels = self.indexINDELs.out
        self.output("indels", source=indexed_indels)
        indexed_snvs = self.indexSNVs.out
        self.output("snvs", source=indexed_snvs)
        somatic_svs = self.manta.somaticSVs
        self.output("somaticSVs", source=somatic_svs)
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        The tool returned by ``getStrelka2Tool()`` is run with the supplied
        indel candidates and with ``strelkaSNVs`` as forced-genotype input.
        Its SNV and indel outputs are each passed through BcfToolsNorm and
        then BcfToolsIndex.
        """
        # The alignment type is supplied by getStrelka2InputType().
        self.input("normalBam", self.getStrelka2InputType())
        self.input("tumorBam", self.getStrelka2InputType())

        self.input("reference", FastaFai)
        self.input("callRegions", BedTabix(optional=True))
        self.input("exome", Boolean(optional=True), default=False)
        self.input("configStrelka", File(optional=True))

        # Arrays of VCFs.  An alternative wiring, a "strelkaIndels" input
        # fed to indelCandidates, is currently disabled.
        self.input("indelCandidates", Array(VcfTabix))
        self.input("strelkaSNVs", Array(VcfTabix))

        strelka_cls = self.getStrelka2Tool()
        second_pass_tool = strelka_cls(
            indelCandidates=self.indelCandidates,
            forcedgt=self.strelkaSNVs,
            normalBam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
            config=self.configStrelka,
        )
        self.step("strelka2pass", second_pass_tool)

        # SNVs: normalise, then index.
        snv_norm_tool = BcfToolsNorm(
            vcf=self.strelka2pass.snvs, reference=self.reference
        )
        self.step("normaliseSNVs", snv_norm_tool)
        snv_index_tool = BcfToolsIndex(vcf=self.normaliseSNVs.out)
        self.step("indexSNVs", snv_index_tool)

        # Indels: normalise, then index.
        indel_norm_tool = BcfToolsNorm(
            vcf=self.strelka2pass.indels, reference=self.reference
        )
        self.step("normaliseINDELs", indel_norm_tool)
        indel_index_tool = BcfToolsIndex(vcf=self.normaliseINDELs.out)
        self.step("indexINDELs", indel_index_tool)

        # Workflow outputs.
        indexed_indels = self.indexINDELs.out
        self.output("indels", source=indexed_indels)
        indexed_snvs = self.indexSNVs.out
        self.output("snvs", source=indexed_snvs)
Example #4
0
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        Manta_1_5_0 runs first and its candidate small indels are given to
        StrelkaGermline_2_9_10.  The Strelka variants are restricted to the
        "PASS" filter with BcfToolsView_1_5 and then passed through
        SplitMultiAllele.
        """
        self.input("bam", BamBai)
        self.input("reference", FastaWithDict)
        # Optional settings shared by both callers.
        self.input("intervals", BedTabix(optional=True))
        self.input("is_exome", Boolean(optional=True))

        manta_tool = Manta_1_5_0(
            bam=self.bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
        )
        self.step("manta", manta_tool)

        strelka_tool = StrelkaGermline_2_9_10(
            bam=self.bam,
            reference=self.reference,
            indelCandidates=self.manta.candidateSmallIndels,
            callRegions=self.intervals,
            exome=self.is_exome,
        )
        self.step("strelka", strelka_tool)

        # Keep only records whose filter is "PASS".
        pass_view_tool = BcfToolsView_1_5(
            file=self.strelka.variants, applyFilters=["PASS"]
        )
        self.step("bcfview", pass_view_tool)

        split_tool = SplitMultiAllele(
            vcf=self.bcfview.out, reference=self.reference
        )
        self.step("split_multi_allele", split_tool)

        # Workflow outputs.
        diploid_sv = self.manta.diploidSV
        self.output("diploid", source=diploid_sv)
        raw_variants = self.strelka.variants
        self.output("variants", source=raw_variants)
        split_variants = self.split_multi_allele.out
        self.output("out", source=split_variants)
Example #5
0
 def inputs(self) -> List[ToolInput]:
     """Return the command-line input definitions for this tool.

     Each entry binds a named input to a command-line prefix.  Most
     entries use position 1; the mode/queue/memGb/quiet/mailTo entries use
     position 3 (presumably the run step rather than the configure step;
     confirm against the tool's command construction).

     Returns:
         The list of ToolInput objects, in declaration order.
     """
     return [
         ToolInput(
             "bam",
             BamBai(),
             prefix="--bam",
             position=1,
             shell_quote=False,
             doc=
             "Sample BAM or CRAM file. May be specified more than once, multiple inputs will be treated "
             "as each BAM file representing a different sample. [required] (no default)",
         ),
         ToolInput(
             "reference",
             FastaWithDict(),
             prefix="--referenceFasta",
             position=1,
             shell_quote=False,
             doc="samtools-indexed reference fasta file [required]",
         ),
         ToolInput(
             "relativeStrelkaDirectory",
             String(optional=True),
             default="strelka_dir",
             prefix="--runDir",
             position=1,
             shell_quote=False,
             doc=
             "Name of directory to be created where all workflow scripts and output will be written. "
             "Each analysis requires a separate directory.",
         ),
         ToolInput(
             "ploidy",
             VcfTabix(optional=True),
             prefix="--ploidy",
             position=1,
             shell_quote=False,
             doc=
             "Provide ploidy file in VCF. The VCF should include one sample column per input sample "
             "labeled with the same sample names found in the input BAM/CRAM RG header sections. "
             "Ploidy should be provided in records using the FORMAT/CN field, which are interpreted "
             "to span the range [POS+1, INFO/END]. Any CN value besides 1 or 0 will be treated as 2. "
             "File must be tabix indexed. (no default)",
         ),
         # NOTE(review): the help text describes a BED file, but the input is
         # typed as VcfTabix -- confirm whether BedTabix is the intended type.
         ToolInput(
             "noCompress",
             VcfTabix(optional=True),
             prefix="--noCompress",
             position=1,
             shell_quote=False,
             doc=
             "Provide BED file of regions where gVCF block compression is not allowed. "
             "File must be bgzip-compressed/tabix-indexed. (no default)",
         ),
         ToolInput(
             "callContinuousVf",
             String(optional=True),
             prefix="--callContinuousVf",
             doc="Call variants on CHROM without a ploidy prior assumption, "
             "issuing calls with continuous variant frequencies (no default)",
         ),
         ToolInput(
             "rna",
             Boolean(optional=True),
             prefix="--rna",
             position=1,
             shell_quote=False,
             doc="Set options for RNA-Seq input.",
         ),
         ToolInput(
             "indelCandidates",
             VcfTabix(optional=True),
             prefix="--indelCandidates",
             position=1,
             shell_quote=False,
             doc=
             "Specify a VCF of candidate indel alleles. These alleles are always evaluated but only "
             "reported in the output when they are inferred to exist in the sample. "
             "The VCF must be tabix indexed. All indel alleles must be left-shifted/normalized, "
             "any unnormalized alleles will be ignored. This option may be specified more than once, "
             "multiple input VCFs will be merged. (default: None)",
         ),
         ToolInput(
             "forcedGT",
             VcfTabix(optional=True),
             prefix="--forcedGT",
             position=1,
             shell_quote=False,
             doc=
             "Specify a VCF of candidate alleles. These alleles are always evaluated and reported even "
             "if they are unlikely to exist in the sample. The VCF must be tabix indexed. "
             "All indel alleles must be left-shifted/normalized, any unnormalized allele will "
             "trigger a runtime error. This option may be specified more than once, multiple input "
             "VCFs will be merged. Note that for any SNVs provided in the VCF, the SNV site will "
             "be reported (and for gVCF, excluded from block compression), "
             "but the specific SNV alleles are ignored. (default: None)",
         ),
         ToolInput(
             "exome",
             Boolean(optional=True),
             prefix="--exome",
             position=1,
             shell_quote=False,
             doc=
             "Set options for exome: note in particular that this flag turns off high-depth filters",
         ),
         # Fixed: this input previously emitted "--exome", duplicating the
         # exome flag instead of passing the tool's separate --targeted flag.
         ToolInput(
             "targeted",
             Boolean(optional=True),
             prefix="--targeted",
             position=1,
             shell_quote=False,
             doc="Set options for other targeted input: "
             "note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="callRegions",
             input_type=BedTabix(optional=True),
             prefix="--callRegions=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
             "regions to call. No VCF output will be provided outside of these regions. "
             "The full genome will still be used to estimate statistics from the input "
             "(such as expected depth per chromosome). Only one BED file may be specified. "
             "(default: call the entire genome)",
         ),
         # ToolInput("version", Boolean(optional=True), prefix="--version", position=3, shell_quote=False,
         #           doc="show program's version number and exit"),
         # ToolInput("help", Boolean(optional=True), prefix="--help", position=3, shell_quote=False,
         #           doc="(-h) show this help message and exit"),
         ToolInput(
             "mode",
             String(optional=True),
             default="local",
             prefix="--mode",
             position=3,
             shell_quote=False,
             doc="(-m MODE)  select run mode (local|sge)",
         ),
         ToolInput(
             "queue",
             String(optional=True),
             prefix="--queue",
             position=3,
             shell_quote=False,
             doc="(-q QUEUE) specify scheduler queue name",
         ),
         ToolInput(
             "memGb",
             String(optional=True),
             prefix="--memGb",
             position=3,
             shell_quote=False,
             doc=" (-g MEMGB) gigabytes of memory available to run workflow "
             "-- only meaningful in local mode, must be an integer (default: Estimate the total "
             "memory for this node for local mode, 'unlimited' for sge mode)",
         ),
         # ToolInput("dryRun", Boolean(optional=True), prefix="--dryRun", position=3, shell_quote=False,
         #           doc="dryRun (-d,) workflow code without actually running command-tasks"),
         ToolInput(
             "quiet",
             Boolean(optional=True),
             prefix="--quiet",
             position=3,
             shell_quote=False,
             doc="Don't write any log output to stderr "
             "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
         ),
         ToolInput(
             "mailTo",
             String(optional=True),
             prefix="--mailTo",
             position=3,
             shell_quote=False,
             doc=
             "(-e) send email notification of job completion status to this address "
             "(may be provided multiple times for more than one email address)",
         ),
     ]
Example #6
0
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        Four callers run on the normal/tumor pair: GATK and VarDict (each
        scattered over their interval lists and gathered afterwards),
        Strelka and GRIDSS.  The GATK, Strelka and VarDict VCFs are then
        combined with CombineVariants_0_0_4 and sorted.

        Every caller receives ``self.normal`` as the normal sample and
        ``self.tumor`` as the tumor sample, matching ``normal_name`` and
        ``tumor_name``.
        """
        self.input("normal", BamBai)
        self.input("tumor", BamBai)

        self.input("normal_name", String(), value="NA24385_normal")
        self.input("tumor_name", String(), value="NA24385_tumour")

        self.input("gridss_blacklist", Bed)

        self.input("gatk_intervals", Array(Bed))
        self.input("vardict_intervals", Array(Bed))
        self.input("strelka_intervals", BedTabix(optional=True))

        self.input("vardict_header_lines", File)
        self.input("allele_freq_threshold", Float, default=0.05)

        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)

        # Fixed: normal_bam/tumor_bam were previously swapped here, so the
        # tumor BAM was processed under normal_name and vice versa.
        self.step(
            "vc_gatk",
            GatkSomaticVariantCaller_4_1_3(
                normal_bam=self.normal,
                tumor_bam=self.tumor,
                normal_name=self.normal_name,
                tumor_name=self.tumor_name,
                intervals=self.gatk_intervals,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
            scatter="intervals",
        )

        # Gather the per-interval GATK results into one VCF.
        self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk))

        self.step(
            "vc_strelka",
            IlluminaSomaticVariantCaller(
                normal_bam=self.normal,
                tumor_bam=self.tumor,
                intervals=self.strelka_intervals,
                reference=self.reference,
            ),
        )

        self.step(
            "vc_gridss",
            Gridss_2_6_3(
                bams=[self.normal, self.tumor],
                reference=self.reference,
                blacklist=self.gridss_blacklist,
            ),
        )

        # Fixed: normal_bam/tumor_bam were previously swapped here as well.
        self.step(
            "vc_vardict",
            VardictSomaticVariantCaller(
                normal_bam=self.normal,
                tumor_bam=self.tumor,
                normal_name=self.normal_name,
                tumor_name=self.tumor_name,
                header_lines=self.vardict_header_lines,
                intervals=self.vardict_intervals,
                reference=self.reference,
                allele_freq_threshold=self.allele_freq_threshold,
            ),
            scatter="intervals",
        )

        # Gather the per-interval VarDict results into one VCF.
        self.step("vc_vardict_merge",
                  Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))

        self.step(
            "combine_variants",
            CombineVariants_0_0_4(
                normal=self.normal_name,
                tumor=self.tumor_name,
                vcfs=[
                    self.vc_gatk_merge.out,
                    self.vc_strelka.out,
                    self.vc_vardict_merge.out,
                ],
                type="somatic",
                columns=["AD", "DP", "GT"],
            ),
        )
        # NOTE(review): the sorted result is not referenced by any output
        # below; "variants_combined" exposes the unsorted combine_variants
        # VCF.  Confirm whether that is intended.
        self.step("sortCombined",
                  BcfToolsSort_1_9(vcf=self.combine_variants.vcf))

        # Outputs

        # NOTE(review): "gridss_assembly" and "variants_gridss" both use
        # vc_gridss.out as their source -- confirm whether the assembly
        # output should come from a different GRIDSS output.
        self.output("gridss_assembly",
                    source=self.vc_gridss.out,
                    output_folder="bams")

        self.output("variants_gatk",
                    source=self.vc_gatk_merge.out,
                    output_folder="variants")
        self.output("variants_strelka",
                    source=self.vc_strelka.out,
                    output_folder="variants")
        self.output(
            "variants_vardict",
            source=self.vc_vardict_merge.out,
            output_folder="variants",
        )
        self.output("variants_gridss",
                    source=self.vc_gridss.out,
                    output_folder="variants")
        self.output(
            "variants_combined",
            source=self.combine_variants.vcf,
            output_folder="variants",
        )
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        Manta_1_5_0 runs on the normal/tumor pair and its candidate small
        indels are given to StrelkaSomatic_2_9_10.  The Strelka SNV and indel
        VCFs are concatenated, sorted, split/normalised, run through
        ExtractStrelkaSomaticADDP_0_1_1 and finally filtered with
        VcfToolsvcftoolsLatest.
        """
        # Mandatory inputs.
        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)
        self.input("reference", FastaWithDict)

        # Optional inputs, forwarded to the callers below.
        self.input("intervals", BedTabix(optional=True))
        self.input("is_exome", Boolean(optional=True))
        self.input("manta_config", File(optional=True))
        self.input("strelka_config", File(optional=True))

        manta_tool = Manta_1_5_0(
            bam=self.normal_bam,
            tumorBam=self.tumor_bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
            config=self.manta_config,
        )
        self.step("manta", manta_tool)

        strelka_tool = StrelkaSomatic_2_9_10(
            indelCandidates=self.manta.candidateSmallIndels,
            normalBam=self.normal_bam,
            tumorBam=self.tumor_bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
            config=self.strelka_config,
        )
        self.step("strelka", strelka_tool)

        # The same two Strelka VCFs supply both the headers and the content.
        concat_tool = ConcatStrelkaSomaticVcf(
            headerVcfs=[self.strelka.snvs, self.strelka.indels],
            contentVcfs=[self.strelka.snvs, self.strelka.indels],
        )
        self.step("concatvcf", concat_tool)

        sort_tool = BcfToolsSort_1_9(vcf=self.concatvcf.out)
        self.step("sortvcf", sort_tool)

        split_tool = SplitMultiAllele(
            vcf=self.sortvcf.out, reference=self.reference
        )
        self.step("splitnormalisevcf", split_tool)

        addp_tool = ExtractStrelkaSomaticADDP_0_1_1(
            vcf=self.splitnormalisevcf.out
        )
        self.step("extractaddp", addp_tool)

        pass_filter_tool = VcfToolsvcftoolsLatest(
            vcf=self.extractaddp.out,
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        # Workflow outputs.
        tumor_sv = self.manta.somaticSV
        self.output("tumor_sv", source=tumor_sv)
        normal_sv = self.manta.diploidSV
        self.output("normal_sv", source=normal_sv)
        sorted_variants = self.sortvcf.out
        self.output("variants", source=sorted_variants)
        filtered_variants = self.filterpass.out
        self.output("out", source=filtered_variants)
Example #8
0
    def constructor(self):
        """Declare the inputs, steps and outputs of this workflow.

        Strelka2PassWorkflowStep1 and Strelka2PassWorkflowStep2 are each
        scattered over ``tumorBam``.  Step 2 receives the SNVs and candidate
        indels produced by step 1.  The step-2 SNVs and indels are each
        refiltered, bgzip-compressed and tabix-indexed.
        """
        self.input("normalBam", CramCrai)
        self.input("tumorBams", Array(CramCrai))

        self.input("reference", FastaFai)

        # Optional caller settings.
        self.input("configStrelka", File(optional=True))
        self.input("callRegions", BedTabix(optional=True))
        self.input("exome", Boolean(optional=True), default=False)

        # Refilter settings.
        self.input("sampleNames", Array(String, optional=True))
        self.input("minAD", Int(optional=True), default=2)

        first_pass = Strelka2PassWorkflowStep1(
            normalBam=self.normalBam,
            tumorBam=self.tumorBams,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
            configStrelka=self.configStrelka,
        )
        self.step("step1", first_pass, scatter="tumorBam")

        # Once janis allows flattening of arguments, indelCandidates should
        # be fed from self.step1.indels instead of the candidate indels.
        second_pass = Strelka2PassWorkflowStep2(
            normalBam=self.normalBam,
            tumorBam=self.tumorBams,
            reference=self.reference,
            callRegions=self.callRegions,
            strelkaSNVs=self.step1.snvs,
            indelCandidates=self.step1.candIndels,
            exome=self.exome,
            configStrelka=self.configStrelka,
        )
        self.step("step2", second_pass, scatter="tumorBam")

        # SNVs: refilter, compress, index.
        snv_refilter = RefilterStrelka2Calls(
            inputFiles=self.step2.snvs,
            sampleNames=self.sampleNames,
            minAD=self.minAD,
        )
        self.step("refilterSNVs", snv_refilter)
        snv_bgzip = BGZip(file=self.refilterSNVs.out)
        self.step("compressSNVs", snv_bgzip, scatter="file")
        snv_tabix = Tabix(inp=self.compressSNVs.out)
        self.step("indexSNVs", snv_tabix, scatter="inp")

        # Indels: refilter, compress, index.
        indel_refilter = RefilterStrelka2Calls(
            inputFiles=self.step2.indels,
            sampleNames=self.sampleNames,
            minAD=self.minAD,
        )
        self.step("refilterINDELs", indel_refilter)
        indel_bgzip = BGZip(file=self.refilterINDELs.out)
        self.step("compressINDELs", indel_bgzip, scatter="file")
        indel_tabix = Tabix(inp=self.compressINDELs.out)
        self.step("indexINDELs", indel_tabix, scatter="inp")

        # Workflow outputs; sampleNames is used as the output folder.
        self.output(
            "snvs",
            Array(VcfTabix),
            source=self.indexSNVs,
            output_folder=self.sampleNames,
        )
        self.output(
            "indels",
            Array(VcfTabix),
            source=self.indexINDELs,
            output_folder=self.sampleNames,
        )

        # Optional output from manta, but we know it will be created.
        self.output(
            "svs",
            source=self.step1.somaticSVs,
            output_folder=self.sampleNames,
        )
Example #9
0
 def inputs(self):
     """Return the command-line input definitions for this tool.

     Each entry binds a named input to a command-line prefix.  Most
     entries use position 1; the mode/queue/memGb/quiet entries use
     position 3 (presumably the run step rather than the configure step;
     confirm against the tool's command construction).

     Returns:
         The list of ToolInput objects, in declaration order.
     """
     return [
         # ToolInput(tag="version", input_type=Boolean(), prefix="--version", separate_value_from_prefix=True,
         #           doc="show program's version number and exit"),
         # ToolInput(tag="help", input_type=Boolean(), prefix="--help", separate_value_from_prefix=True,
         #           doc="(-h) show this help message and exit"),
         # ToolInput(tag="allhelp", input_type=Boolean(), prefix="--allHelp", separate_value_from_prefix=True,
         #           doc="show all extended/hidden options"),
         ToolInput(
             tag="normalBam",
             input_type=BamBai(),
             prefix="--normalBam=",
             separate_value_from_prefix=False,
             position=1,
             doc="Normal sample BAM or CRAM file. (no default)",
         ),
         ToolInput(
             tag="tumorBam",
             input_type=BamBai(),
             prefix="--tumourBam=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "(--tumorBam)  Tumor sample BAM or CRAM file. [required] (no default)",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithDict(),
             prefix="--referenceFasta=",
             position=1,
             separate_value_from_prefix=False,
             doc=" samtools-indexed reference fasta file [required]",
         ),
         ToolInput(
             tag="rundir",
             input_type=Filename(),
             prefix="--runDir=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Name of directory to be created where all workflow scripts and output will be written. "
             "Each analysis requires a separate directory. (default: StrelkaSomaticWorkflow)",
         ),
         ToolInput(
             tag="region",
             input_type=Array(String, optional=True),
             prefix="--region",
             prefix_applies_to_all_elements=True,
             position=1,
             doc=
             "Limit the analysis to one or more genome region(s) for debugging purposes. If this argument "
             "is provided multiple times the union of all specified regions will be analyzed. All regions "
             "must be non-overlapping to get a meaningful result. Examples: '--region chr20' "
             "(whole chromosome), '--region chr2:100-2000 --region chr3:2500-3000' (two regions)'. "
             # Fixed: a space was missing between the two concatenated pieces.
             "If this option is specified (one or more times) together with the 'callRegions' BED file, "
             "then all region arguments will be intersected with the callRegions BED track.",
         ),
         ToolInput(
             tag="config",
             input_type=File(optional=True),
             prefix="--config=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "provide a configuration file to override defaults in global config file "
             "(/opt/strelka/bin/configureStrelkaSomaticWorkflow.py.ini)",
         ),
         ToolInput(
             tag="outputcallableregions",
             input_type=Boolean(optional=True),
             prefix="--outputCallableRegions",
             position=1,
             separate_value_from_prefix=True,
             doc=
             "Output a bed file describing somatic callable regions of the genome",
         ),
         ToolInput(
             tag="indelCandidates",
             input_type=Array(VcfTabix, optional=True),
             prefix="--indelCandidates=",
             prefix_applies_to_all_elements=True,
             position=1,
             separate_value_from_prefix=False,
             doc=
             "Specify a VCF of candidate indel alleles. These alleles are always evaluated "
             "but only reported in the output when they are inferred to exist in the sample. "
             "The VCF must be tabix indexed. All indel alleles must be left-shifted/normalized, "
             "any unnormalized alleles will be ignored. This option may be specified more than once, "
             "multiple input VCFs will be merged. (default: None)",
         ),
         ToolInput(
             tag="forcedgt",
             input_type=Array(VcfTabix, optional=True),
             prefix="--forcedGT=",
             separate_value_from_prefix=False,
             prefix_applies_to_all_elements=True,
             position=1,
             doc=
             "Specify a VCF of candidate alleles. These alleles are always evaluated and reported even "
             "if they are unlikely to exist in the sample. The VCF must be tabix indexed. All indel "
             "alleles must be left-shifted/normalized, any unnormalized allele will trigger a runtime "
             "error. This option may be specified more than once, multiple input VCFs will be merged. "
             "Note that for any SNVs provided in the VCF, the SNV site will be reported (and for gVCF, "
             "excluded from block compression), but the specific SNV alleles are ignored. (default: None)",
         ),
         ToolInput(
             tag="targeted",
             input_type=Boolean(optional=True),
             prefix="--targeted",
             separate_value_from_prefix=True,
             position=1,
             doc="Set options for other targeted input: "
             "note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="exome",
             input_type=Boolean(optional=True),
             prefix="--exome",
             separate_value_from_prefix=True,
             position=1,
             doc=
             "Set options for exome: note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="callRegions",
             input_type=BedTabix(optional=True),
             prefix="--callRegions=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
             "regions to call. No VCF output will be provided outside of these regions. "
             "The full genome will still be used to estimate statistics from the input "
             "(such as expected depth per chromosome). Only one BED file may be specified. "
             "(default: call the entire genome)",
         ),
         ToolInput(
             tag="noisevcf",
             input_type=VcfTabix(optional=True),
             prefix="--noiseVcf=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Noise vcf file (submit argument multiple times for more than one file)",
         ),
         ToolInput(
             tag="scansizemb",
             input_type=Int(optional=True),
             prefix="--scanSizeMb=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Maximum sequence region size (in megabases) scanned by each "
             "task during genome variant calling. (default: 12)",
         ),
         ToolInput(
             tag="callmemmb",
             input_type=Int(optional=True),
             prefix="--callMemMb=",
             position=1,
             separate_value_from_prefix=False,
             doc=
             "Set variant calling task memory limit (in megabytes). It is not recommended to change the "
             "default in most cases, but this might be required for a sample of unusual depth.",
         ),
         ToolInput(
             tag="retaintempfiles",
             input_type=Boolean(optional=True),
             default=False,
             position=1,
             prefix="--retainTempFiles",
             separate_value_from_prefix=True,
             doc="Keep all temporary files (for workflow debugging)",
         ),
         ToolInput(
             tag="disableevs",
             input_type=Boolean(optional=True),
             prefix="--disableEVS",
             position=1,
             separate_value_from_prefix=True,
             doc="Disable empirical variant scoring (EVS).",
         ),
         ToolInput(
             tag="reportevsfeatures",
             input_type=Boolean(optional=True),
             prefix="--reportEVSFeatures",
             position=1,
             separate_value_from_prefix=True,
             doc=
             " Report all empirical variant scoring features in VCF output.",
         ),
         ToolInput(
             tag="snvscoringmodelfile",
             input_type=File(optional=True),
             prefix="--snvScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             # Fixed: the default path contained a stray space in the file name.
             doc=" Provide a custom empirical scoring model file for SNVs "
             "(default: /opt/strelka/share/config/somaticSNVScoringModels.json)",
         ),
         ToolInput(
             tag="indelscoringmodelfile",
             input_type=File(optional=True),
             prefix="--indelScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             # Fixed: the default path contained a stray space in the file name.
             doc=" Provide a custom empirical scoring model file for indels "
             "(default: /opt/strelka/share/config/somaticIndelScoringModels.json)",
         ),
         ToolInput(
             "mode",
             String(optional=True),
             default="local",
             prefix="--mode",
             position=3,
             shell_quote=False,
             doc="(-m MODE)  select run mode (local|sge)",
         ),
         ToolInput(
             "queue",
             String(optional=True),
             prefix="--queue",
             position=3,
             shell_quote=False,
             doc="(-q QUEUE) specify scheduler queue name",
         ),
         ToolInput(
             "memGb",
             String(optional=True),
             prefix="--memGb",
             position=3,
             shell_quote=False,
             doc=" (-g MEMGB) gigabytes of memory available to run workflow "
             "-- only meaningful in local mode, must be an integer (default: Estimate the total "
             "memory for this node for local mode, 'unlimited' for sge mode)",
         ),
         ToolInput(
             "quiet",
             Boolean(optional=True),
             prefix="--quiet",
             position=3,
             shell_quote=False,
             doc="Don't write any log output to stderr "
             "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
         ),
         # ToolInput("mailTo", String(optional=True), prefix="--mailTo", position=3, shell_quote=False,
         #           doc="(-e) send email notification of job completion status to this address "
         #               "(may be provided multiple times for more than one email address)"),
     ]
Example #10
0
class MantaBase(IlluminaToolBase, ABC):
    """Version-independent description of the Manta structural variant caller.

    The rendered command line is assembled from positional pieces:

    * position 0 -- the literal ``configManta.py``
    * position 1 -- every entry of ``config_inputs``
    * position 2 -- ``;<runDir>/runWorkflow.py`` (unquoted)
    * position 3 -- every entry of ``running_inputs`` plus ``-j <cpus>``

    Concrete subclasses must override :meth:`container` to pin the image of a
    specific Manta release.
    """

    def tool(self):
        """Return the identifier of this tool (``"manta"``)."""
        return "manta"

    def base_command(self):
        """Return ``None``: the command is built entirely in ``arguments()``."""
        return None

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU count resolved from ``hints``, or 4 if none applies.

        :param hints: hints forwarded to
            ``get_value_for_hints_and_ordered_resource_tuple`` together with
            ``CORES_TUPLE``.
        :return: the looked-up value when it is truthy, otherwise ``4``.
        """
        return get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE) or 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory amount resolved from ``hints``, or 4 if none applies.

        :param hints: hints forwarded to
            ``get_value_for_hints_and_ordered_resource_tuple`` together with
            ``MEM_TUPLE``.
        :return: the looked-up value when it is truthy, otherwise ``4``
            (presumably gigabytes -- confirm against the base class contract).
        """
        return get_value_for_hints_and_ordered_resource_tuple(
            hints, MEM_TUPLE) or 4

    def inputs(self) -> List[ToolInput]:
        """Return the configuration inputs followed by the run-time inputs."""
        return [*self.config_inputs, *self.running_inputs]

    def outputs(self) -> List[ToolOutput]:
        """Return the outputs collected from beneath the ``runDir`` input.

        Every glob is the value of the ``runDir`` input with a fixed relative
        path appended.
        """
        return [
            # Workflow script and its pickled configuration.
            ToolOutput(
                "python",
                File(),
                glob=InputSelector("runDir") + "/runWorkflow.py",
            ),
            ToolOutput(
                "pickle",
                File(),
                glob=InputSelector("runDir") + "/runWorkflow.py.config.pickle",
            ),
            # Variant call sets under results/variants.
            ToolOutput(
                "candidateSV",
                VcfTabix(),
                glob=InputSelector("runDir")
                + "/results/variants/candidateSV.vcf.gz",
            ),
            ToolOutput(
                "candidateSmallIndels",
                VcfTabix(),
                glob=InputSelector("runDir")
                + "/results/variants/candidateSmallIndels.vcf.gz",
            ),
            ToolOutput(
                "diploidSV",
                VcfTabix(),
                glob=InputSelector("runDir")
                + "/results/variants/diploidSV.vcf.gz",
            ),
            # Run statistics under results/stats.
            ToolOutput(
                "alignmentStatsSummary",
                File(),
                glob=InputSelector("runDir")
                + "/results/stats/alignmentStatsSummary.txt",
            ),
            ToolOutput(
                "svCandidateGenerationStats",
                Tsv(),
                glob=InputSelector("runDir")
                + "/results/stats/svCandidateGenerationStats.tsv",
            ),
            ToolOutput(
                "svLocusGraphStats",
                Tsv(),
                glob=InputSelector("runDir")
                + "/results/stats/svLocusGraphStats.tsv",
            ),
        ]

    def arguments(self) -> List[ToolArgument]:
        """Return the fixed arguments that frame the two-stage invocation."""
        return [
            ToolArgument("configManta.py", position=0, shell_quote=False),
            # Left unquoted on purpose: the leading ";" is intended to end the
            # configManta.py command so <runDir>/runWorkflow.py starts a new one.
            ToolArgument(
                StringFormatter(";")
                + InputSelector("runDir")
                + "/runWorkflow.py",
                position=2,
                shell_quote=False,
            ),
            ToolArgument(
                CpuSelector(None),
                position=3,
                shell_quote=False,
                prefix="-j",
                doc="(-j) number of jobs, must be an integer or 'unlimited' "
                "(default: Estimate total cores on this node for local mode, 128 for sge mode)",
            ),
        ]

    @abstractmethod
    def container(self):
        """Return the container image; every concrete version must override this.

        :raises Exception: always, when the base implementation is called.
        """
        # The message previously named Strelka (copy/paste from the sibling
        # base class), which misdirected anyone debugging a Manta subclass.
        raise Exception("Manta version must override docker command")

    def friendly_name(self):
        """Return the human readable tool name."""
        return "Manta"

    def bind_metadata(self):
        """Build and return the ``ToolMetadata`` describing Manta."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2019, 2, 12),
            dateUpdated=date(2019, 2, 19),
            institution="Illumina",
            doi=" doi:10.1093/bioinformatics/btv710",
            citation=(
                "Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and "
                "cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710"
            ),
            keywords=["illumina", "manta", "variant caller"],
            documentationUrl="https://github.com/Illumina/manta",
            documentation="""
Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. 
It is optimized for analysis of germline variation in small sets of individuals and somatic 
variation in tumor/normal sample pairs. Manta discovers, assembles and scores large-scale SVs, 
medium-sized indels and large insertions within a single efficient workflow. The method is 
designed for rapid analysis on standard compute hardware: NA12878 at 50x genomic coverage is 
analyzed in less than 20 minutes on a 20 core server, and most WGS tumor/normal analyses 
can be completed within 2 hours. Manta combines paired and split-read evidence during SV 
discovery and scoring to improve accuracy, but does not require split-reads or successful 
breakpoint assemblies to report a variant in cases where there is strong evidence otherwise. 

It provides scoring models for germline variants in small sets of diploid samples and somatic 
variants in matched tumor/normal sample pairs. There is experimental support for analysis of 
unmatched tumor samples as well. Manta accepts input read mappings from BAM or CRAM files and 
reports all SV and indel inferences in VCF 4.1 format. See the user guide for a full description 
of capabilities and limitations.""".strip(),
        )

    # Inputs rendered at position 1, i.e. directly after "configManta.py".
    config_inputs = [
        ToolInput(
            "config",
            File(optional=True),
            prefix="--config",
            position=1,
            shell_quote=False,
            doc="provide a configuration file to override defaults in global config file "
            "(/opt/conda/share/manta-1.2.1-0/bin/configManta.py.ini)",
        ),
        ToolInput(
            "bam",
            BamBai(),
            prefix="--bam",
            position=1,
            shell_quote=False,
            doc="FILE Normal sample BAM or CRAM file. May be specified more than once, multiple inputs "
            "will be treated as each BAM file representing a different sample. [optional] (no default)",
        ),
        ToolInput(
            "runDir",
            Filename(),
            prefix="--runDir",
            position=1,
            shell_quote=False,
            doc="Run script and run output will be written to this directory [required] "
            "(default: MantaWorkflow)",
        ),
        ToolInput(
            "reference",
            FastaWithDict(),
            prefix="--referenceFasta",
            position=1,
            shell_quote=False,
            doc="samtools-indexed reference fasta file [required]",
        ),
        ToolInput(
            "tumorBam",
            BamBai(optional=True),
            prefix="--tumorBam",
            position=1,
            shell_quote=False,
            doc="Tumor sample BAM or CRAM file. Only up to one tumor bam file accepted. [optional=null]",
        ),
        ToolInput(
            "exome",
            Boolean(optional=True),
            prefix="--exome",
            position=1,
            shell_quote=False,
            doc="Set options for WES input: turn off depth filters",
        ),
        # --rna, --unstrandedRNA and --outputContig are value-less switches of
        # configManta.py, so they are modelled as Boolean like --exome; a file
        # type would append a stray path after the flag.
        ToolInput(
            "rna",
            Boolean(optional=True),
            prefix="--rna",
            position=1,
            shell_quote=False,
            doc="Set options for RNA-Seq input. Must specify exactly one bam input file",
        ),
        ToolInput(
            "unstrandedRNA",
            Boolean(optional=True),
            prefix="--unstrandedRNA",
            position=1,
            shell_quote=False,
            doc="Set if RNA-Seq input is unstranded: Allows splice-junctions on either strand",
        ),
        ToolInput(
            "outputContig",
            Boolean(optional=True),
            prefix="--outputContig",
            position=1,
            shell_quote=False,
            doc="Output assembled contig sequences in VCF file",
        ),
        ToolInput(
            "callRegions",
            BedTabix(optional=True),
            prefix="--callRegions",
            position=1,
            shell_quote=False,
            doc="Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
            "regions to call. No VCF output will be provided outside of these regions. The full "
            "genome will still be used to estimate statistics from the input (such as expected depth "
            "per chromosome). Only one BED file may be specified. (default: call the entire genome)",
        ),
    ]

    # Inputs rendered at position 3, i.e. after the runWorkflow.py argument.
    running_inputs = [
        ToolInput(
            "mode",
            String(optional=True),
            default="local",
            prefix="--mode",
            position=3,
            shell_quote=False,
            doc="(-m) select run mode (local|sge)",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--quiet",
            position=3,
            shell_quote=False,
            doc="Don't write any log output to stderr "
            "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
        ),
        ToolInput(
            "queue",
            String(optional=True),
            prefix="--queue",
            position=3,
            shell_quote=False,
            doc="(-q) specify scheduler queue name",
        ),
        ToolInput(
            "memgb",
            Int(optional=True),
            prefix="--memGb",
            position=3,
            shell_quote=False,
            doc="(-g) gigabytes of memory available to run workflow -- only meaningful in local mode, "
            "must be an integer (default: Estimate the total memory for this node for local  mode, "
            "'unlimited' for sge mode)",
        ),
        # ToolInput("dryRun", Boolean(optional=True), prefix="--dryRun", position=3, shell_quote=False,
        #           doc="(-d) dryRun workflow code without actually running command - tasks"),
        ToolInput(
            "maxTaskRuntime",
            String(optional=True),
            prefix="--maxTaskRuntime",
            position=3,
            shell_quote=False,
            doc="(format: hh:mm:ss) Specify scheduler max runtime per task, argument is "
            "provided to the 'h_rt' resource limit if using SGE (no default)",
        ),
    ]