Esempio n. 1
0
    def tests(self) -> Optional[List[TTestCase]]:
        """Return the test cases declared for this workflow.

        A single case named ``brca1`` is defined. All of its file inputs are
        remote URLs under the ``petermac_testdata`` folder, and its expected
        output is the concatenation of the ``basic_test`` expectations of the
        individual output types.
        """
        bioinf_base = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics"
        chr17 = f"{bioinf_base}/petermac_testdata"

        brca1_inputs = {
            "sample_name": "NA12878",
            "reference": f"{chr17}/Homo_sapiens_assembly38.chr17.fasta",
            # One inner list holding the R1/R2 files, wrapped in an outer list.
            "fastqs": [
                [
                    f"{chr17}/NA12878-BRCA1_R1.fastq.gz",
                    f"{chr17}/NA12878-BRCA1_R2.fastq.gz",
                ]
            ],
            "gatk_intervals": [f"{chr17}/BRCA1.hg38.bed"],
            "known_indels": f"{chr17}/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz",
            "mills_indels": f"{chr17}/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz",
            "snps_1000gp": f"{chr17}/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz",
            "snps_dbsnp": f"{chr17}/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz",
            "cutadapt_adapters": f"{chr17}/contaminant_list.txt",
        }

        # Expectations are chained with "+" in the same order as before; both
        # fastqc report expectations target the same output tag.
        expected_outputs = (
            Vcf.basic_test("out_variants_bamstats", 51300, 230)
            + Vcf.basic_test("out_variants_gatk_split", 50710, 221)
            + BamBai.basic_test("out_bam", 2822000, 49600)
            + TextFile.basic_test(
                "out_performance_summary",
                948,
                md5="575354942cfb8d0367725f9020181443",
            )
            + Array.array_wrapper(
                [
                    ZipFile.basic_test("out_fastqc_reports", 408000),
                    ZipFile.basic_test("out_fastqc_reports", 416000),
                ]
            )
        )

        return [
            TTestCase(name="brca1", input=brca1_inputs, output=expected_outputs)
        ]
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     Returns an optional ``vcf`` at position 1 and the generated
     ``outputFilename`` (``.filter`` suffix, ``.vcf`` extension) passed with
     ``-o`` at position 3.
     """
     vcf_input = ToolInput("vcf", Vcf(optional=True), position=1)
     filtered_name = Filename(extension=".vcf", suffix=".filter")
     output_input = ToolInput(
         "outputFilename",
         filtered_name,
         prefix="-o",
         position=3,
         shell_quote=False,
     )
     return [vcf_input, output_input]
Esempio n. 3
0
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     The list is returned in this order: ``outputFilename``, ``vcfs``,
     ``type``, ``columns``, ``normal``, ``tumor``, ``priority``.
     """
     output_name = ToolInput(
         "outputFilename",
         Filename(extension=".vcf", suffix=".combined"),
         prefix="-o",
     )
     # NOTE: a "regions" input (Filename with ".tsv" extension, prefix
     # "--regions") used to be declared here; it is deprecated and no longer
     # part of the returned list.
     source_vcfs = ToolInput(
         "vcfs",
         Array(Vcf()),
         prefix="-i",
         prefix_applies_to_all_elements=True,
         doc="input vcfs, the priority of the vcfs will be based on the order of the input",
     )
     call_type = ToolInput(
         "type", String(), prefix="--type", doc="germline | somatic"
     )
     kept_columns = ToolInput(
         "columns",
         Array(String(), optional=True),
         prefix="--columns",
         separator=",",
         doc="Columns to keep, seperated by space output vcf (unsorted)",
     )
     normal_id = ToolInput(
         "normal",
         String(optional=True),
         prefix="--normal",
         doc="Sample id of germline vcf, or normal sample id of somatic vcf",
     )
     tumor_id = ToolInput(
         "tumor",
         String(optional=True),
         prefix="--tumor",
         doc="tumor sample ID, required if inputs are somatic vcfs",
     )
     caller_priority = ToolInput(
         "priority",
         Int(optional=True),
         prefix="--priority",
         doc="The priority of the callers, must match with the callers in the source header",
     )
     return [
         output_name,
         source_vcfs,
         call_type,
         kept_columns,
         normal_id,
         tumor_id,
         caller_priority,
     ]
Esempio n. 4
0
 def outputs(self) -> List[ToolOutput]:
     """Declare the tool outputs.

     Both outputs are optional: ``vcf`` is collected with the ``*.vcf.recal``
     glob and ``used_options`` with ``VQRLogs/*.json``.
     """
     recalibrated_vcf = ToolOutput(
         "vcf", Vcf(optional=True), glob=WildcardSelector("*.vcf.recal")
     )
     options_log = ToolOutput(
         "used_options",
         File(optional=True),
         glob=WildcardSelector("VQRLogs/*.json"),
     )
     return [recalibrated_vcf, options_log]
Esempio n. 5
0
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     Returns the ``vcf`` to process (position 0) and the generated
     ``outputFilename`` (``.trimmed`` suffix, ``.vcf`` extension) at
     position 2.
     """
     source_vcf = ToolInput(
         "vcf",
         Vcf(),
         position=0,
         doc="The VCF to remove the IUPAC bases from",
     )
     trimmed_name = ToolInput(
         "outputFilename",
         Filename(extension=".vcf", suffix=".trimmed"),
         position=2,
     )
     return [source_vcf, trimmed_name]
Esempio n. 6
0
 def inputs(self):
     """Declare the tool inputs.

     Returns the ``vcf`` (position 10) and the ``outputFilename`` passed with
     ``--output``, followed by every entry of ``self.additional_args``.
     """
     source_vcf = ToolInput("vcf", Vcf(), position=10)
     output_name = ToolInput(
         "outputFilename",
         Filename(extension=".vcf"),
         prefix="--output",
         doc="[-o] see Common Options",
     )
     return [source_vcf, output_name, *self.additional_args]
Esempio n. 7
0
    def constructor(self):
        """Wire up the workflow: one VCF input followed by three steps.

        Steps are registered in order: ``vep`` (tab output written to
        ``generated.txt``), ``vepfilter`` (filters the ``vep`` output) and
        ``vepvcf`` (a second VEP run fed from the ``vepfilter`` step).
        """
        # TODO: work out 'target_gene_file'

        # Planned pipeline layout:
        # [
        #   vep
        #   + vepfilter
        #   + report_vep_cleanup
        #   + report_vep_text,
        #   + vepvcf +
        #   vepfiltervcf
        #   , [
        #       chr_rename + liftover + oncotator_format,
        #       report_vep_vcf_cleanup + report_vep
        #   ]
        # ]
        self.input("variants", Vcf())

        # First VEP pass: vcf=False with an explicit text output filename.
        vep_text_tool = VepCacheLatest(
            inputFile=self.variants,
            symbol=True,
            filterCommon=True,
            sift="b",
            polyphen="b",
            outputFilename="generated.txt",
            vcf=False,
        )
        self.step("vep", vep_text_tool)

        # Filter the tab-formatted VEP output; "FILE" is a placeholder value
        # for target_gene_file (see the TODO above).
        vep_output = self.vep.out
        gene_filter = StringFormatter(
            "SYMBOL in {target_gene_file}", target_gene_file="FILE"
        )
        filter_tool = FilterVep_98_3(
            input_file=vep_output,
            format="tab",
            filter=gene_filter,
        )
        self.step("vepfilter", filter_tool)

        # Second VEP pass, fed directly from the vepfilter step node.
        vep_vcf_tool = VepCacheLatest(
            inputFile=self.vepfilter,
            symbol=True,
            filterCommon=True,
            alleleNumber=True,
            sift="b",
            polyphen="b",
        )
        self.step("vepvcf", vep_vcf_tool)
Esempio n. 8
0
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs, none of which are shell-quoted.

     Returns ``vcf`` (position 1), ``reference`` (``-r``, position 4) and the
     generated ``outputFilename`` (``-o``, position 6, ``.norm`` suffix).
     """
     source_vcf = ToolInput("vcf", Vcf(), position=1, shell_quote=False)
     reference = ToolInput(
         "reference",
         FastaWithDict(),
         prefix="-r",
         position=4,
         shell_quote=False,
     )
     normalised_name = Filename(extension=".vcf", suffix=".norm")
     output_name = ToolInput(
         "outputFilename",
         normalised_name,
         position=6,
         prefix="-o",
         shell_quote=False,
     )
     return [source_vcf, reference, output_name]
Esempio n. 9
0
 def outputs(self) -> List[ToolOutput]:
     """Declare the tool outputs.

     ``vcf`` is collected with ``*.vcf``; the optional ``used_options`` and
     ``strandmetrics`` files are collected with ``PiscesLogs/*.json`` and
     ``*ReadStrandBias.txt`` respectively.
     """
     called_vcf = ToolOutput("vcf", Vcf(), glob=WildcardSelector("*.vcf"))
     options_log = ToolOutput(
         "used_options",
         File(optional=True),
         glob=WildcardSelector("PiscesLogs/*.json"),
     )
     strand_metrics = ToolOutput(
         "strandmetrics",
         File(optional=True),
         glob=WildcardSelector("*ReadStrandBias.txt"),
     )
     return [called_vcf, options_log, strand_metrics]
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     Returns ``vcf`` (position 1) and ``outputFilename`` (``-o``, position 3).
     The output filename is prefixed with the ``vcf`` input selected with
     ``remove_file_extension=True``, then ``.filter`` and ``.vcf``.
     """
     source_vcf = ToolInput("vcf", Vcf(), position=1)
     filtered_name = Filename(
         prefix=InputSelector("vcf", remove_file_extension=True),
         extension=".vcf",
         suffix=".filter",
     )
     output_name = ToolInput(
         "outputFilename",
         filtered_name,
         prefix="-o",
         position=3,
     )
     return [source_vcf, output_name]
Esempio n. 11
0
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     Returns ``file`` (position 100) and the generated ``outputFilename``
     (``.vcf.gz`` extension, position 102, not shell-quoted), followed by
     every entry of ``self.additional_args``.
     """
     file_to_compress = ToolInput(
         "file", Vcf(), position=100, doc="File to bgzip compress"
     )
     compressed_name = ToolInput(
         "outputFilename",
         Filename(extension=".vcf.gz"),
         position=102,
         shell_quote=False,
     )
     return [file_to_compress, compressed_name, *self.additional_args]
Esempio n. 12
0
 def inputs(self):
     """Declare the tool inputs.

     The entries of ``self.additional_inputs`` come first, followed by
     ``inputVcf`` (``-i``), ``outputFilename`` (``-o``) and ``type``
     (``--type``).
     """
     declared = list(self.additional_inputs)
     declared.append(
         ToolInput("inputVcf", Vcf(), prefix="-i", doc="input vcf")
     )
     declared.append(
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf", suffix=".addbamstats"),
             prefix="-o",
             doc="output vcf name",
         )
     )
     declared.append(
         ToolInput(
             "type",
             String(),
             prefix="--type",
             doc="must be either germline or somatic",
         )
     )
     return declared
Esempio n. 13
0
 def inputs(self):
     """Declare the tool inputs.

     Returns ``vcf`` (``--vcf``) and ``outputFilename`` (``--out``), followed
     by every entry of ``self.additional_inputs``.
     """
     # bcf is not supported yet
     source_vcf = ToolInput(
         "vcf",
         Vcf(),
         prefix="--vcf",
         doc="This option defines the VCF file to be processed. VCFtools expects files in VCF format v4.0, v4.1 or v4.2. The latter two are supported  with  some  small limitations. If the user provides a dash character '-' as a file name, the program expects a VCF file to be piped in through standard in.",
     )
     output_prefix = ToolInput(
         "outputFilename",
         Filename(),
         prefix="--out",
         doc='<output_prefix>. This option defines the output filename prefix for all files generated by vcftools. For example, if <prefix> is set to output_filename, then all output files will be of the form output_filename.*** . If this option is omitted, all output files will have the prefix "out." in the current working directory.',
     )
     return [source_vcf, output_prefix, *self.additional_inputs]
Esempio n. 14
0
 def tests(self):
     """Return the test cases declared for this tool.

     A single ``basic`` case is defined. Its file inputs live in the
     ``wgsgermline_data`` folder under
     ``BioinformaticsTool.test_data_path()``, and its expected output is
     built with ``Vcf.basic_test`` for the ``out`` tag.
     """
     # Resolve the data folder once instead of rebuilding the same
     # three-part join for every file input.
     data_dir = os.path.join(
         BioinformaticsTool.test_data_path(), "wgsgermline_data"
     )

     case_inputs = {
         "bam": os.path.join(data_dir, "NA12878-BRCA1.markduped.bam"),
         "reference": os.path.join(
             data_dir, "Homo_sapiens_assembly38.chr17.fasta"
         ),
         "vcf": os.path.join(
             data_dir, "NA12878-BRCA1.sorted.uncompressed.stdout"
         ),
         "samtoolsmpileup_countOrphans": True,
         "samtoolsmpileup_noBAQ": True,
         "samtoolsmpileup_maxDepth": 10000,
         "samtoolsmpileup_minBQ": 0,
         "addbamstats_type": "germline",
     }
     expected_output = Vcf.basic_test(
         "out",
         69225,
         230,
         ["GATKCommandLine"],
         "db09c6c37c52771bd058e32d5c6b94c1",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 15
0
 def tests(self):
     """Return the test cases declared for this workflow.

     A single ``basic`` case is defined, using remote files from the
     ``wgsgermline_data`` and ``wgssomatic_data`` folders. The expected
     output chains the expectations for ``out``, ``variants`` and ``out_bam``.
     """
     parent_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics"
     germline_data = f"{parent_dir}/wgsgermline_data"
     somatic_data = f"{parent_dir}/wgssomatic_data"

     case_inputs = {
         "normal_bam": f"{somatic_data}/NA24385-BRCA1.markduped.recalibrated.bam",
         "tumor_bam": f"{somatic_data}/NA12878-NA24385-mixture.markduped.recalibrated.bam",
         "reference": f"{germline_data}/Homo_sapiens_assembly38.chr17.fasta",
         "gnomad": f"{somatic_data}/af-only-gnomad.hg38.BRCA1.vcf.gz",
         "intervals": f"{germline_data}/BRCA1.hg38.bed",
         "normal_name": "NA24385-BRCA1",
         "filterpass_removeFileteredAll": True,
         "filterpass_recode": True,
         "filterpass_recodeINFOAll": True,
         "output_bam_name": "mutect2.bam",
     }

     # Expectations are built in the original order and joined with "+".
     filtered_vcf = Vcf.basic_test(
         "out",
         33000,
         147,
         ["GATKCommandLine"],
         "c083775bc8c49397fb65ec12cd435688",
     )
     indexed_variants = VcfTabix.basic_test(
         "variants",
         13000,
         260,
         182,
         ["GATKCommandLine"],
         "6cfd70dda8599a270978868166ab6545",
     )
     output_bam = BamBai.basic_test(
         "out_bam",
         813200,
         21200,
         f"{somatic_data}/somatic_variant_caller.flagstat",
     )
     expected_output = filtered_vcf + indexed_variants + output_bam

     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output),
     ]
Esempio n. 16
0
 def tests(self):
     """Return the test cases declared for this tool.

     A single ``basic`` case is defined, using remote files from the
     ``wgsgermline_data`` folder and a ``Vcf.basic_test`` expectation for
     the ``out`` tag.
     """
     remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

     case_inputs = {
         "inputVcf": f"{remote_dir}/NA12878-BRCA1.sorted.uncompressed.stdout",
         "mpileup": f"{remote_dir}/NA12878-BRCA1.mpileup.stdout",
         "type": "germline",
     }
     expected_output = Vcf.basic_test(
         "out",
         69225,
         230,
         ["GATKCommandLine"],
         "db09c6c37c52771bd058e32d5c6b94c1",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 17
0
 def tests(self):
     """Return the test cases declared for this tool.

     A single ``basic`` case is defined: one remote VCF is supplied in
     ``vcfs`` together with ``javaOptions``, and the ``out`` tag is checked
     with ``Vcf.basic_test``.
     """
     remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

     case_inputs = {
         "javaOptions": ["-Xmx6G"],
         "vcfs": [f"{remote_dir}/NA12878-BRCA1.norm.vcf"],
     }
     expected_output = Vcf.basic_test(
         "out",
         51615,
         221,
         ["GATKCommandLine"],
         "b7acb0a9900713cc7da7aeed5160c971",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 18
0
 def tests(self):
     """Return the test cases declared for this workflow.

     A single ``basic`` case is defined. Its file inputs live in the
     ``wgsgermline_data`` folder under
     ``BioinformaticsTool.test_data_path()``, and its expected output is
     built with ``Vcf.basic_test`` for the ``out`` tag.
     """
     # Resolve the data folder once instead of rebuilding the same
     # three-part join for each of the four file inputs.
     data_dir = os.path.join(
         BioinformaticsTool.test_data_path(), "wgsgermline_data"
     )

     case_inputs = {
         "bam": os.path.join(data_dir, "NA12878-BRCA1.recalibrated.bam"),
         "intervals": os.path.join(data_dir, "BRCA1.hg38.bed"),
         "reference": os.path.join(
             data_dir, "Homo_sapiens_assembly38.chr17.fasta"
         ),
         "snps_dbsnp": os.path.join(
             data_dir, "Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz"
         ),
         "haplotype_caller_pairHmmImplementation": "LOGLESS_CACHING",
     }
     expected_output = Vcf.basic_test(
         "out",
         51000,
         221,
         ["GATKCommandLine"],
         "5e48624cb5ef379a7d6d39cec44bc856",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 19
0
 def tests(self):
     """Return the test cases declared for this tool.

     A single ``basic`` case is defined: a remote VCF from the
     ``wgssomatic_data`` folder plus three boolean flags, with a
     ``Vcf.basic_test`` expectation for the ``out`` tag.
     """
     parent_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics"
     somatic_data = f"{parent_dir}/wgssomatic_data"

     case_inputs = {
         "vcf": f"{somatic_data}/stdout.norm.vcf",
         "removeFileteredAll": True,
         "recode": True,
         "recodeINFOAll": True,
     }
     expected_output = Vcf.basic_test(
         "out",
         34393,
         147,
         ["GATKCommandLine"],
         "c083775bc8c49397fb65ec12cd435688",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output),
     ]
Esempio n. 20
0
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs.

     Returns ``piscesVersion``, ``inputVcf`` (``--vcf``, position 4) and
     ``outputDir`` (``--outfolder``, position 5), followed by every entry of
     ``self.vqr_additional_args``.
     """
     version = ToolInput("piscesVersion", String())
     source_vcf = ToolInput(
         "inputVcf",
         Vcf(),
         prefix="--vcf",
         position=4,
         shell_quote=False,
         doc="Input VCF file",
     )
     output_dir = ToolInput(
         "outputDir",
         String(),
         prefix="--outfolder",
         position=5,
         shell_quote=False,
         doc="Output directory",
     )
     return [version, source_vcf, output_dir, *self.vqr_additional_args]
    def constructor(self):
        """Wire up the workflow: three inputs, three steps, three outputs.

        The ``vcf`` input is passed through ``bgzip`` and ``tabix`` before
        being handed to ``genotypeConcord`` together with ``truth`` and
        ``intervals``. The three metrics of ``genotypeConcord`` are exposed
        as workflow outputs under the same names.
        """
        self.input("vcf", Vcf)
        self.input("truth", VcfIdx)
        self.input("intervals", Array(Vcf()))

        compress_tool = BGZip_1_2_1(file=self.vcf)
        self.step("bgzip", compress_tool)

        index_tool = Tabix_1_2_1(file=self.bgzip)
        self.step("tabix", index_tool)

        concordance_tool = Gatk4GenotypeConcordanceLatest(
            callVCF=self.tabix,
            truthVCF=self.truth,
            intervals=self.intervals,
            treatMissingSitesAsHomeRef=True,
        )
        self.step("genotypeConcord", concordance_tool)

        # Each workflow output forwards the same-named genotypeConcord output.
        for metric in ("summaryMetrics", "detailMetrics", "contingencyMetrics"):
            self.output(metric, source=getattr(self.genotypeConcord, metric))
Esempio n. 22
0
 def tests(self):
     """Return the test cases declared for this tool.

     A single ``basic`` case is defined: one VCF from the
     ``wgsgermline_data`` folder under
     ``BioinformaticsTool.test_data_path()`` is supplied in ``vcfs``
     together with ``javaOptions``, and the ``out`` tag is checked with
     ``Vcf.basic_test``.
     """
     normalised_vcf = os.path.join(
         BioinformaticsTool.test_data_path(),
         "wgsgermline_data",
         "NA12878-BRCA1.norm.vcf",
     )
     case_inputs = {
         "javaOptions": ["-Xmx6G"],
         "vcfs": [normalised_vcf],
     }
     expected_output = Vcf.basic_test(
         "out",
         51615,
         221,
         ["GATKCommandLine"],
         "b7acb0a9900713cc7da7aeed5160c971",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 23
0
 def inputs(self):
     """Declare the tool inputs.

     The parent's inputs come first, then ``reference``, ``inputBam``,
     ``inputVcf`` and ``outputVcf`` (all at position 2, not shell-quoted),
     and finally every entry of ``self.additional_variant_annotator_args``.
     """
     declared = list(super().inputs())
     declared.extend(
         [
             ToolInput(
                 "reference",
                 FastaWithIndexes(),
                 prefix="--reference",
                 position=2,
                 shell_quote=False,
                 doc="Reference Genome FASTA",
             ),
             ToolInput(
                 "inputBam",
                 BamBai(),
                 prefix="--input",
                 position=2,
                 shell_quote=False,
                 doc="Input BAM",
             ),
             ToolInput(
                 "inputVcf",
                 Vcf(),
                 prefix="--variant",
                 position=2,
                 shell_quote=False,
                 doc="Input VCF",
             ),
             ToolInput(
                 "outputVcf",
                 Filename(),
                 prefix="--output",
                 position=2,
                 shell_quote=False,
                 doc="Output VCF filename",
             ),
         ]
     )
     declared.extend(self.additional_variant_annotator_args)
     return declared
Esempio n. 24
0
 def tests(self):
     """Return the test cases declared for this workflow.

     A single ``basic`` case is defined, using remote files from the
     ``wgsgermline_data`` folder and a ``Vcf.basic_test`` expectation for
     the ``out`` tag.
     """
     remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

     case_inputs = {
         "bam": f"{remote_dir}/NA12878-BRCA1.recalibrated.bam",
         "intervals": f"{remote_dir}/BRCA1.hg38.bed",
         "reference": f"{remote_dir}/Homo_sapiens_assembly38.chr17.fasta",
         "snps_dbsnp": f"{remote_dir}/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz",
         "haplotype_caller_pairHmmImplementation": "LOGLESS_CACHING",
     }
     expected_output = Vcf.basic_test(
         "out",
         51000,
         221,
         ["GATKCommandLine"],
         "5e48624cb5ef379a7d6d39cec44bc856",
     )
     return [
         TTestCase(name="basic", input=case_inputs, output=expected_output)
     ]
Esempio n. 25
0
class PiscesVariantCallerBase(IlluminaToolBase):
    def tool(self) -> str:
        """Return the tool identifier string."""
        tool_id = "PiscesVariantCaller"
        return tool_id

    def friendly_name(self) -> str:
        """Return the human-readable display name of the tool."""
        display_name = "Pisces: Variant Caller"
        return display_name

    def bind_metadata(self):
        """Build and return the ToolMetadata record describing this tool."""
        from datetime import date

        created_on = date(2021, 8, 19)
        updated_on = date(2021, 10, 12)
        search_keywords = ["Illumina", "Pisces", "Variant Caller"]

        return ToolMetadata(
            contributors=["Miriam M Yeung"],
            dateCreated=created_on,
            dateUpdated=updated_on,
            institution="Illumina",
            doi=None,
            citation="",
            keywords=search_keywords,
            documentationUrl="",
            documentation="Calls variants",
        )

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU count to request for this tool.

        The value looked up from ``hints`` via ``CORES_TUPLE`` is used when it
        is truthy; otherwise the fallback of 4 is returned.
        """
        hinted = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        return hinted if hinted else 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory amount to request for this tool.

        The value looked up from ``hints`` via ``MEM_TUPLE`` is used when it
        is truthy; otherwise the fallback of 4 is returned (unit not visible
        here, presumably GB - TODO confirm).
        """
        hinted = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        return hinted if hinted else 4

    def base_command(self):
        """Return no base command; the command line is built in ``arguments``."""
        no_base_command = None
        return no_base_command

    def arguments(self):
        """Return the fixed leading arguments of the command line.

        In position order: an ``export TMPDIR=/tmp;`` statement, ``dotnet``,
        and the path to ``Pisces.dll`` formatted from the ``piscesVersion``
        input. None of them are shell-quoted.
        """
        set_tmpdir = ToolArgument(
            "export TMPDIR=/tmp;", position=1, shell_quote=False
        )
        dotnet = ToolArgument("dotnet", position=2, shell_quote=False)
        dll_path = StringFormatter(
            "/app/Pisces_v{PISCES_VERSION}/Pisces.dll",
            PISCES_VERSION=InputSelector("piscesVersion"),
        )
        pisces_dll = ToolArgument(dll_path, position=3, shell_quote=False)
        return [set_tmpdir, dotnet, pisces_dll]

    def inputs(self) -> List[ToolInput]:
        """Declare the tool inputs.

        Returns ``piscesVersion``, ``inputBam``, ``referenceFolder``,
        ``outputDir``, ``intervalBedFile``, ``minimumBaseQuality``,
        ``callMNVs`` and ``outputSBFiles``, followed by every entry of
        ``self.pisces_additional_args``.
        """
        version = ToolInput("piscesVersion", String())
        input_bam = ToolInput(
            "inputBam",
            BamBai(),
            prefix="-b",
            position=4,
            shell_quote=False,
            doc="Input BAM file",
        )
        reference_folder = ToolInput(
            "referenceFolder",
            Directory(),
            prefix="--genomefolders",
            position=5,
            shell_quote=False,
            doc="Folder containing reference genome files",
        )
        output_dir = ToolInput(
            "outputDir",
            String(),
            prefix="--outfolder",
            position=4,
            shell_quote=False,
            doc="Output directory",
        )
        interval_bed = ToolInput(
            "intervalBedFile",
            Bed(optional=True),
            prefix="-i",
            position=5,
            shell_quote=False,
            doc="Bed File denoting regions to call variants.",
        )
        min_base_quality = ToolInput(
            "minimumBaseQuality",
            Int(optional=True),
            prefix="--minbq",
            position=5,
            shell_quote=False,
            default=20,
            doc="Minimum base call quality to use base in read. (Default: 20)",
        )
        # The two boolean-like options below are declared as optional strings.
        call_mnvs = ToolInput(
            "callMNVs",
            String(optional=True),
            prefix="--callmnvs",
            position=5,
            shell_quote=False,
            doc="Call Multi Nucleotide Variants (aka Phased SNPs). (Default: false)",
        )
        output_sb_files = ToolInput(
            "outputSBFiles",
            String(optional=True),
            prefix="--outputsbfiles",
            position=5,
            shell_quote=False,
            doc="Boolean Flag to output strand bias files. (Default: false)",
        )
        return [
            version,
            input_bam,
            reference_folder,
            output_dir,
            interval_bed,
            min_base_quality,
            call_mnvs,
            output_sb_files,
            *self.pisces_additional_args,
        ]

    def outputs(self) -> List[ToolOutput]:
        """Declare the tool outputs.

        ``vcf`` is collected with ``*.vcf``; the optional ``used_options``
        and ``strandmetrics`` files are collected with ``PiscesLogs/*.json``
        and ``*ReadStrandBias.txt`` respectively.
        """
        called_vcf = ToolOutput("vcf", Vcf(), glob=WildcardSelector("*.vcf"))
        options_log = ToolOutput(
            "used_options",
            File(optional=True),
            glob=WildcardSelector("PiscesLogs/*.json"),
        )
        strand_metrics = ToolOutput(
            "strandmetrics",
            File(optional=True),
            glob=WildcardSelector("*ReadStrandBias.txt"),
        )
        return [called_vcf, options_log, strand_metrics]

    pisces_additional_args = [
        ToolInput(
            "forcedAlleles",
            Vcf(optional=True),
            prefix="--forcedalleles",
            position=5,
            shell_quote=False,
            doc="Path to vcf of alleles where reporting is forced",
        ),
        ToolInput(
            "maxMNVLength",
            Int(optional=True),
            prefix="--maxmnvlength",
            position=5,
            shell_quote=False,
            doc="Maximum lenght of phased SNPs. Must be between 1 - 1000. (Default: 3)",
        ),
        ToolInput(
            "maxGapBetweenMNV",
            Int(optional=True),
            prefix="--maxgapbetweenmnv",
            position=5,
            shell_quote=False,
            doc="Maximum gap allowed between phased SNPs. Must be greater than 0. (Default: 1)",
        ),
        ToolInput(
            "collapseVariants",
            String(optional=True),
            prefix="--collapse",
            position=5,
            shell_quote=False,
            doc="Boolean flag for whether to collapse variants. (Default: true)",
        ),
        ToolInput(
            "collapseFreqThreshold",
            Float(optional=True),
            prefix="--collapsefreqthreshold",
            position=5,
            shell_quote=False,
            doc="when collapsing, minimum frequency of targetted variants. (Default: 0)",
        ),
        ToolInput(
            "collpaseFreqRatioThreshold",
            Float(optional=True),
            prefix="--collapsefreqratiothreshold",
            position=5,
            shell_quote=False,
            doc="When collapsing, minimum ratio requred of target variant frequency to collapsible variant frequency. (Default: 0.5)",
        ),
        ToolInput(
            "priorsPath",
            Vcf(optional=True),
            prefix="--priorspath",
            position=5,
            shell_quote=False,
            doc="Path to vcf file containing known variants, to preferentially reconcile collapsed variants",
        ),
        ToolInput(
            "trimMNVPriors",
            String(optional=True),
            prefix="--trimmnvpriors",
            position=5,
            shell_quote=False,
            doc="Boolean denoting if preceeding bases from the priorsPath VCF shoudl be trimmed. Note: COSMIC convention includeds preceeeding base for a MNV. (Default: false)",
        ),
        ToolInput(
            "coverageMethod",
            String(optional=True),
            prefix="--coveragemethod",
            position=5,
            shell_quote=False,
            doc="'approximate' or 'exact'. Exact is more precise and requires a minimum of 8GB of memory. (Default: approximate)",
        ),
        ToolInput(
            "baseLogName",
            String(optional=True),
            prefix="--baselogname",
            position=5,
            shell_quote=False,
            doc="",
        ),
        ToolInput(
            "debug",
            String(optional=True),
            prefix="-d",
            position=5,
            shell_quote=False,
            doc="Boolean flag for debugging",
        ),
        ToolInput(
            "useStitchedXD",
            String(optional=True),
            prefix="--usestitchedxd",
            position=5,
            shell_quote=False,
            doc="Boolean denoting whether the XD tag (stitched direction) is specified in the bam. ONLY USE IF USING GEMINI TO STITCH BAMS.",
        ),
        ToolInput(
            "trackedAnchorSize",
            Float(optional=True),
            prefix="--trackanchorsize",
            position=5,
            shell_quote=False,
            doc="Max size of anchor tor granularly track, when collecting reference coverage at insertion sites. Higher values == more precise (Default: 5)",
        ),
        ToolInput(
            "chrFilter",
            String(optional=True),
            prefix="--chrfilter",
            position=5,
            shell_quote=False,
            doc="Chromosome to process, will filter out all other chromosomes from output if specified. (Default: None)",
        ),
        ToolInput(
            "outFolder",
            String(optional=True),
            prefix="-o",
            position=4,
            shell_quote=False,
            doc="Ouput folder path",
        ),
        ToolInput(
            "maxThreads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-t",
            position=4,
            shell_quote=False,
            doc="Maximum number of threads. (Default: 20)",
        ),
        ToolInput(
            "threadByChr",
            String(optional=True),
            prefix="--threadbychr",
            position=5,
            shell_quote=False,
            doc="Parallelize by chromosome. (Default: false)",
        ),
        ToolInput(
            "multiProcess",
            String(optional=True),
            prefix="--multiprocess",
            position=5,
            shell_quote=False,
            doc="When thread by chr, launch separate processes to parallelize. (Default: true)",
        ),
        ## Bam Filtering Options
        ToolInput(
            "minimumMappingQuality",
            Int(),
            prefix="--minmq",
            position=5,
            shell_quote=False,
            default=1,
            doc="Minimum mapping quality to use a read. (Default: 1)",
        ),
        ToolInput(
            "filterDuplicates",
            String(optional=True),
            prefix="--filterduplicates",
            position=5,
            shell_quote=False,
            doc="Boolean Flag to filter out reads marked as duplicates. (Default: true)",
        ),
        ToolInput(
            "onlyUseProperPairs",
            String(optional=True),
            prefix="--pp",
            position=5,
            shell_quote=False,
            doc="Boolean Flag to only use proper pairs. (Default: false)",
        ),
        ## Variant Calling Options
        ToolInput(
            "minimumVariantQualityScore",
            Int(optional=True),
            prefix="--minvariantqscore",
            position=5,
            shell_quote=False,
            doc="Minimum Variant Quality Score to report a variant. (Default: 20)",
        ),
        ToolInput(
            "minimumCoverage",
            Int(optional=True),
            prefix="--mindepth",
            position=5,
            shell_quote=False,
            doc="Minimum depth to call a variant. (Default: 10)",
        ),
        ToolInput(
            "minimumVariantFrequency",
            Float(optional=True),
            prefix="--minimumvariantfrequency",
            position=5,
            shell_quote=False,
            doc="Minimum variant frequency to call a variant. Must be between 0 and 1. (Default: 0.01)",
        ),
        ToolInput(
            "targetLODFrequency",
            Float(optional=True),
            prefix="--targetvf",
            position=5,
            shell_quote=False,
            doc="Target Frequency to call a variant (i.e. to target a 5% allele frequency, we must call down to 2.6%, to capture a 5% allelle 95% of the time). Parameter is used by the Somatic Genotyping Model",
        ),
        ToolInput(
            "variantQualityFilter",
            Int(optional=True),
            prefix="--variantqualityfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for variant quality score filter to report a variant as 'FilteredVariantQScore'. (Default: 30)",
        ),
        ToolInput(
            "minimumVariantFrequencyFilter",
            Float(optional=True),
            prefix="--minvariantfrequencyfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for variant frequency to report a variant as 'FilteredVariantFrequency'. (Default: None)",
        ),
        ToolInput(
            "genotypeQualityFilter",
            Int(optional=True),
            prefix="--gqfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for genotype quality, if below the threshold, variant is reported as 'FilteredGenotype'. Should be greater than 0. (Default: None)",
        ),
        ToolInput(
            "minimumDepthFilter",
            Int(optional=True),
            prefix="--mindepthfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for reporting variants as 'FilteredLowDepth', if below the given threshold. (Default: None)",
        ),
        ToolInput(
            "enableSingleStrandFilter",
            String(optional=True),
            prefix="--ssfilter",
            position=5,
            shell_quote=False,
            doc="Filter variants with coverage limited to a single strand with filter flag 'SB'",
        ),
        ToolInput(
            "strandBiasModel",
            String(optional=True),
            prefix="--sbmodel",
            position=5,
            shell_quote=False,
            doc="Strand Bias Mode. Must be 'poisson|extended'. (Default: extended)",
        ),
        ToolInput(
            "noiseLevelForQModel",
            Int(optional=True),
            prefix="--NoiseLevelForQModel",
            position=5,
            shell_quote=False,
            doc="Noise Level to be used in the quality model for a variant quality score. Which is used to determine false positives. Must be >= 0. (Default: minimum base quality)\nNOTE: If this value is greater than the minBQ, it implies that the variant calls have higher confidence than the recorded BQ.",
        ),
        ToolInput(
            "ploidy",
            String(optional=True),
            prefix="--ploidy",
            position=5,
            shell_quote=False,
            doc="Ploidy model to determine the genotype of variant. Select from 'somatic|diploid|DiploidByAdaptiveGT'. (Default: somatic)",
        ),
        ToolInput(
            "diploidSNVGenotypeParameters",
            String(optional=True),
            prefix="--diploidsnvgenotypeparameters",
            position=5,
            shell_quote=False,
            doc="Comma-separated List of 3 floats in the format A,B,C. All must be between 0 and 1. A = Minimum Allelle frequence to be detected as 0/1(heterozygous), B = Maximum Allele frequence to be detected as 0/1, C = Minimum value for the sum of allells 1 and 2 (i.e. if C is not met the sit is flagged as 'Multiallelic'). (Default: 0.20, 0.70, 0.80)",
        ),
        ToolInput(
            "diploidIndelGenotypeParameters",
            String(optional=True),
            prefix="--diploidindelgenotypeparameters",
            position=5,
            shell_quote=False,
            doc="Comma-separated List of 3 floats in the format A,B,C. All must be between 0 and 1. A = Minimum Allelle frequence to be detected as 0/1(heterozygous), B = Maximum Allele frequence to be detected as 0/1, C = Minimum value for the sum of allells 1 and 2 (i.e. if C is not met the sit is flagged as 'Multiallelic'). (Default: 0.20, 0.70, 0.80)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersSNV",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_snvmodel",
            position=5,
            shell_quote=False,
            doc="Comma-separated list of 4 floats in the format A,B,C,D. (Default: 0.034,0.167,0.499,0.998)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersIndel",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_indelmodel",
            position=5,
            shell_quote=False,
            doc="(Default: 0.037,0.443,0.905)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersSNVPrior",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_snvprior",
            position=5,
            shell_quote=False,
            doc="(Default: 0.729,0.044,0.141,0.087)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersIndelPrior",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_indelprior",
            position=5,
            shell_quote=False,
            doc="(Default: 0.962,0.0266,0.0114)",
        ),
        ToolInput(
            "maximumVariantQualityScore",
            Int(optional=True),
            prefix="--maxvq",
            position=5,
            shell_quote=False,
            doc="Maximum variant quality score possible. (Default: 100)",
        ),
        ToolInput(
            "maximumGenotypeQualityScore",
            Int(optional=True),
            prefix="--maxgq",
            position=5,
            shell_quote=False,
            doc="Maximum genotype quality score possible. (Default: 100)",
        ),
        ToolInput(
            "maximumGenotypePosteriorScore",
            Int(optional=True),
            prefix="--maxgp",
            position=5,
            shell_quote=False,
            doc="Maximum Genotype Posterior score. (Default: 300)",
        ),
        ToolInput(
            "minimumGenotypeQualityScore",
            Int(optional=True),
            prefix="--mingq",
            position=5,
            shell_quote=False,
            doc="Minimum genotype quality score. (Default: 0)",
        ),
        ToolInput(
            "RMxNFilter",
            String(optional=True),
            prefix="--rmxnfilter",
            position=5,
            shell_quote=False,
            doc="Comma-separated list in the format M,N,F, indicating the max length of a repeat region(M), the minimum number of repeatitions (N), to be applied if the variant frequency is less than (F). (Default: 5,8,0.20)",
        ),
        ToolInput(
            "noCallFilter",
            Float(optional=True),
            prefix="--ncfilter",
            position=5,
            shell_quote=False,
            doc="No Call rate filter",
        ),
        ## Vcf Writer options
        ToolInput(
            "gVCF",
            String(optional=True),
            prefix="--gvcf",
            position=5,
            shell_quote=False,
            doc="Output as a gVCF. (Default: false)",
        ),
        ToolInput(
            "crushVCF",
            String(optional=True),
            prefix="--crushvcf",
            position=5,
            shell_quote=False,
            doc="Crush vcf output into one line per loci. (Default: false)",
        ),
        ToolInput(
            "reportNoCalls",
            String(optional=True),
            prefix="--reportnocalls",
            position=5,
            shell_quote=False,
            doc="Report the proportion of no-calls in the output. (Default: false)",
        ),
        ToolInput(
            "reportReadCollapsedReadCount",
            String(optional=True),
            prefix="--reportrccounts",
            position=5,
            shell_quote=False,
            doc="Debugging helper, when BAM files contain X1 & X2 tags, reports collapsed read counts for the categories 'duplex-stitched|duplex-nonstitched|simplex-stitched|simplex-nonstitched'. (Default: false)",
        ),
        ToolInput(
            "reportTemplateStrandCounts",
            String(optional=True),
            prefix="--reporttscounts",
            position=5,
            shell_quote=False,
            doc="Debugging helper, conditional on ReportRcCounts. Reports read counts for different template strands for the categories 'duplex-stitched|duplex-nonstitched|simplex-forward-stitched|simplex-forward-nonstitched|simplex-reverse-stitched|simplex-reverse-nonstitched''",
        ),
        ToolInput(
            "reportSuspiciousCoverageFraction",
            String(optional=True),
            prefix="--reportsuspiciouscoveragefraction",
            position=5,
            shell_quote=False,
            doc="Debugging helper, Reports the fraction of total coverage that is 'suspicious'. i.e. unanchored and bearing some resemblance to an insertion at the site. Note that for spanning varaints, this is start + end coverage, therefore up to double the coverage reported. (Default: false)",
        ),
    ]
Esempio n. 26
0
 def inputs(self) -> List[j.ToolInput]:
     """Declare the command-line inputs of the VCF comparison tool.

     The truth and comparison VCFs are positional (positions 1 and 2); every
     other input is emitted with its ``--long-option`` prefix. Bare switches
     are typed ``j.Boolean`` so only the flag is rendered, while options that
     carry a value use the type of that value (``j.Int``, ``j.Float``,
     ``j.String`` or a file type), so the rendered command line is well formed.

     Returns:
         The list of ``j.ToolInput`` definitions, in declaration order.
     """
     return [
         j.ToolInput("truthVCF", Vcf(), position=1),
         j.ToolInput("compareVCF", Vcf(), position=2),
         j.ToolInput(
             "reportPrefix",
             j.Filename(),
             prefix="--report-prefix",
             doc="(-o)  Filename prefix for report output.",
         ),
         j.ToolInput(
             "reference",
             FastaWithDict(),
             prefix="--reference",
             doc="(-r)  Specify a reference file.",
         ),
         j.ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--target-regions",
             doc=
             "(-T)  Restrict analysis to given (dense) regions (using -T in bcftools).",
         ),
         j.ToolInput(
             "version",
             j.Boolean(optional=True),
             prefix="--version",
             doc="(-v) Show version number and exit.",
         ),
         j.ToolInput(
             "scratchPrefix",
             j.String(optional=True),
             prefix="--scratch-prefix",
             doc="Directory for scratch files.",
         ),
         # --keep-scratch is a bare switch in hap.py (it takes no value), so it
         # must be a Boolean; a String would inject a stray positional argument.
         j.ToolInput(
             "keepScratch",
             j.Boolean(optional=True),
             prefix="--keep-scratch",
             doc=
             "Keep the scratch files rather than deleting them once the run has finished.",
         ),
         j.ToolInput(
             "falsePositives",
             Bed(optional=True),
             prefix="--false-positives",
             doc=
             "(-f)  False positive / confident call regions (.bed or .bed.gz). "
             "Calls outside these regions will be labelled as UNK.",
         ),
         j.ToolInput(
             "stratification",
             Tsv(optional=True),
             prefix="--stratification",
             doc=
             " Stratification file list (TSV format -- first column is region name, "
             "second column is file name).",
         ),
         j.ToolInput(
             "stratificationRegion",
             j.String(optional=True),
             prefix="--stratification-region",
             doc=
             "Add single stratification region, e.g. --stratification-region TEST:test.bed",
         ),
         # Bare switch in hap.py, hence Boolean rather than String.
         j.ToolInput(
             "stratificationFixchr",
             j.Boolean(optional=True),
             prefix="--stratification-fixchr",
             doc=" Add chr prefix to stratification files if necessary",
         ),
         j.ToolInput(
             "writeVcf",
             j.Boolean(optional=True),
             prefix="--write-vcf",
             doc="(-V) Write an annotated VCF.",
         ),
         j.ToolInput(
             "writeCounts",
             j.Boolean(optional=True),
             prefix="--write-counts",
             doc="(-X) Write advanced counts and metrics.",
         ),
         j.ToolInput(
             "noWriteCounts",
             j.Boolean(optional=True),
             prefix="--no-write-counts",
             doc="Do not write advanced counts and metrics.",
         ),
         j.ToolInput(
             "outputVtc",
             j.Boolean(optional=True),
             prefix="--output-vtc",
             doc=
             "Write VTC field in the final VCF which gives the counts each position has contributed to.",
         ),
         j.ToolInput(
             "preserveInfo",
             j.Boolean(optional=True),
             prefix="--preserve-info",
             doc=
             "When using XCMP, preserve and merge the INFO fields in truth and query. "
             "Useful for ROC computation.",
         ),
         j.ToolInput(
             "roc",
             j.String(optional=True),
             prefix="--roc",
             doc=
             "Select a feature to produce a ROC on (INFO feature, QUAL, GQX, ...).",
         ),
         j.ToolInput(
             "noRoc",
             j.Boolean(optional=True),
             prefix="--no-roc",
             doc=
             "Disable ROC computation and only output summary statistics for more concise output.",
         ),
         j.ToolInput(
             "rocRegions",
             j.String(optional=True),
             prefix="--roc-regions",
             doc=" Select a list of regions to compute ROCs in. By default, "
             "only the '*' region will produce ROC output (aggregate variant counts).",
         ),
         j.ToolInput(
             "rocFilter",
             j.String(optional=True),
             prefix="--roc-filter",
             doc=" Select a filter to ignore when making ROCs.",
         ),
         # hap.py parses --roc-delta as a float, so fractional spacings must be
         # representable.
         j.ToolInput(
             "rocDelta",
             j.Float(optional=True),
             prefix="--roc-delta",
             doc=" Minimum spacing between ROC QQ levels.",
         ),
         # A confidence level is fractional; hap.py parses --ci-alpha as a float.
         j.ToolInput(
             "ciAlpha",
             j.Float(optional=True),
             prefix="--ci-alpha",
             doc=
             "Confidence level for Jeffrey's CI for recall, precision and fraction of non-assessed calls.",
         ),
         j.ToolInput(
             "noJson",
             j.Boolean(optional=True),
             prefix="--no-json",
             doc="Disable JSON file output.",
         ),
         # j.ToolInput("location", Array(j.String(), optional=True), prefix="--location", separator=",",
         #           doc="(-l)  Comma-separated list of locations [use naming after preprocessing], "
         #               "when not specified will use whole VCF."),
         j.ToolInput(
             "passOnly",
             j.Boolean(optional=True),
             prefix="--pass-only",
             doc="Keep only PASS variants.",
         ),
         # j.ToolInput("filtersOnly", Array(j.String(), optional=True), prefix="--filters-only", separator=",",
         #           doc=" Specify a comma-separated list of filters to apply "
         #               "(by default all filters are ignored / passed on."),
         # -R takes a regions file (the sparse counterpart of --target-regions),
         # so it is a Bed input rather than a flag.
         j.ToolInput(
             "restrictRegions",
             Bed(optional=True),
             prefix="--restrict-regions",
             doc=
             "(-R)  Restrict analysis to given (sparse) regions (using -R in bcftools).",
         ),
         j.ToolInput(
             "leftshift",
             j.Boolean(optional=True),
             prefix="--leftshift",
             doc="(-L) Left-shift variants safely.",
         ),
         j.ToolInput(
             "noLeftshift",
             j.Boolean(optional=True),
             prefix="--no-leftshift",
             doc="Do not left-shift variants safely.",
         ),
         j.ToolInput(
             "decompose",
             j.Boolean(optional=True),
             prefix="--decompose",
             doc=
             "Decompose variants into primitives. This results in more granular counts.",
         ),
         j.ToolInput(
             "noDecompose",
             j.Boolean(optional=True),
             prefix="--no-decompose",
             doc="(-D) Do not decompose variants into primitives.",
         ),
         j.ToolInput(
             "bcftoolsNorm",
             j.Boolean(optional=True),
             prefix="--bcftools-norm",
             doc="Enable preprocessing through bcftools norm -c x -D "
             "(requires external preprocessing to be switched on).",
         ),
         j.ToolInput(
             "fixchr",
             j.Boolean(optional=True),
             prefix="--fixchr",
             doc=
             "Add chr prefix to VCF records where necessary (default: auto, attempt to match reference).",
         ),
         j.ToolInput(
             "noFixchr",
             j.Boolean(optional=True),
             prefix="--no-fixchr",
             doc=
             "Do not add chr prefix to VCF records (default: auto, attempt to match reference).",
         ),
         j.ToolInput(
             "bcf",
             j.Boolean(optional=True),
             prefix="--bcf",
             doc=
             "Use BCF internally. This is the default when the input file is in BCF format already. "
             "Using BCF can speed up temp file access, but may fail for VCF files that have broken "
             "headers or records that don't comply with the header.",
         ),
         j.ToolInput(
             "somatic",
             j.Boolean(optional=True),
             prefix="--somatic",
             doc=
             "Assume the input file is a somatic call file and squash all columns into one, "
             "putting all FORMATs into INFO + use half genotypes (see also --set-gt). "
             "This will replace all sample columns and replace them with a single one. "
             "This is used to treat Strelka somatic files Possible values for this parameter: "
             "half / hemi / het / hom / half to assign one of the following genotypes to the "
             "resulting sample: 1 | 0/1 | 1/1 | ./1. This will replace all sample columns and "
             "replace them with a single one.",
         ),
         # --set-gt selects one of the listed genotype modes, so it carries a
         # string value; a Boolean would render the flag without its value.
         j.ToolInput(
             "setGT",
             j.String(optional=True),
             prefix="--set-gt",
             doc=
             "This is used to treat Strelka somatic files Possible values for this parameter: "
             "half / hemi / het / hom / half to assign one of the following genotypes to the resulting "
             "sample: 1 | 0/1 | 1/1 | ./1. "
             "This will replace all sample columns and replace them with a single one.",
         ),
         j.ToolInput(
             "gender",
             j.String(optional=True),
             prefix="--gender",
             doc=
             "({male,female,auto,none})  Specify gender. This determines how haploid calls on chrX "
             "get treated: for male samples, all non-ref calls (in the truthset only when "
             "running through hap.py) are given a 1/1 genotype.",
         ),
         j.ToolInput(
             "preprocessTruth",
             j.Boolean(optional=True),
             prefix="--preprocess-truth",
             doc="Preprocess truth file with same settings as query "
             "(default is to accept truth in original format).",
         ),
         j.ToolInput(
             "usefilteredTruth",
             j.Boolean(optional=True),
             prefix="--usefiltered-truth",
             doc="Use filtered variant calls in truth file "
             "(by default, only PASS calls in the truth file are used)",
         ),
         # A window size is a number, not a switch.
         j.ToolInput(
             "preprocessingWindowSize",
             j.Int(optional=True),
             prefix="--preprocessing-window-size",
             doc=" Preprocessing window size (variants further apart than "
             "that size are not expected to interfere).",
         ),
         j.ToolInput(
             "adjustConfRegions",
             j.Boolean(optional=True),
             prefix="--adjust-conf-regions",
             doc=
             " Adjust confident regions to include variant locations. Note this will only include "
             "variants that are included in the CONF regions already when viewing with bcftools; "
             "this option only makes sure insertions are padded correctly in the CONF regions (to "
             "capture these, both the base before and after must be contained in the bed file).",
         ),
         j.ToolInput(
             "noAdjustConfRegions",
             j.Boolean(optional=True),
             prefix="--no-adjust-conf-regions",
             doc=" Do not adjust confident regions for insertions.",
         ),
         j.ToolInput(
             "noHaplotypeComparison",
             j.Boolean(optional=True),
             prefix="--no-haplotype-comparison",
             doc=
             "(--unhappy)  Disable haplotype comparison (only count direct GT matches as TP).",
         ),
         j.ToolInput(
             "windowSize",
             j.Int(optional=True),
             prefix="--window-size",
             doc=
             "(-w)  Minimum distance between variants such that they fall into the same superlocus.",
         ),
         j.ToolInput(
             "xcmpEnumerationThreshold",
             j.Int(optional=True),
             prefix="--xcmp-enumeration-threshold",
             doc=
             " Enumeration threshold / maximum number of sequences to enumerate per block.",
         ),
         j.ToolInput(
             "xcmpExpandHapblocks",
             j.String(optional=True),
             prefix="--xcmp-expand-hapblocks",
             doc=
             " Expand haplotype blocks by this many basepairs left and right.",
         ),
         j.ToolInput(
             "threads",
             j.Int(optional=True),
             prefix="--threads",
             default=j.CpuSelector(),
             doc="Number of threads to use.",
         ),
         # j.ToolInput("engineVcfevalPath", j.String(optional=True), prefix="--engine-vcfeval-path",
         #           doc=" This parameter should give the path to the \"rtg\" executable. "
         #               "The default is /opt/hap.py/lib/python27/Haplo/../../../libexec/rtg- tools-install/rtg"),
         j.ToolInput(
             "engine",
             j.String(optional=True),
             prefix="--engine",
             doc=
             " {xcmp,vcfeval,scmp-somatic,scmp-distance} Comparison engine to use.",
         ),
         j.ToolInput(
             "engineVcfevalTemplate",
             j.String(optional=True),
             prefix="--engine-vcfeval-template",
             doc=
             " Vcfeval needs the reference sequence formatted in its own file format (SDF -- run rtg "
             "format -o ref.SDF ref.fa). You can specify this here to save time when running hap.py "
             "with vcfeval. If no SDF folder is specified, hap.py will create a temporary one.",
         ),
         j.ToolInput(
             "scmpDistance",
             j.Int(optional=True),
             prefix="--scmp-distance",
             doc=
             " For distance-based matching, this is the distance between variants to use.",
         ),
         j.ToolInput(
             "logfile",
             j.Filename(suffix="-log", extension=".txt"),
             prefix="--logfile",
             doc="Write logging information into file rather than to stderr",
         ),
         j.ToolInput(
             "verbose",
             j.Boolean(optional=True),
             prefix="--verbose",
             doc="Raise logging level from warning to info.",
         ),
         j.ToolInput(
             "quiet",
             j.Boolean(optional=True),
             prefix="--quiet",
             doc="Set logging level to output errors only.",
         ),
     ]
Esempio n. 27
0
    def tool_modifier(self, tool: Tool, inputs: Dict,
                      hints: Dict[str, str]) -> Tool:
        """Wrap ``tool`` in a new workflow that also validates chosen outputs.

        The wrapper workflow (named ``<tool id>_validated``) forwards every
        input of ``tool``, runs ``tool`` as a step, re-exposes all of its
        outputs, and adds one ``HapPyValidator_0_3_9`` step per entry in
        ``self.validation.fields``. Each validator output is exposed as
        ``validated_<field>_<validator output id>`` in the "validated" folder.

        Args:
            tool: The tool or workflow to wrap.
            inputs: Not used by this method.
            hints: Not used by this method.

        Returns:
            The wrapping workflow builder.

        Raises:
            Exception: If any validation field is reported as missing from
                the tool's outputs.
        """
        from janis_bioinformatics.data_types import FastaWithDict, Vcf, Bed
        from janis_bioinformatics.tools.illumina import HapPyValidator_0_3_9

        validation = self.validation

        missing, incompatible = ensure_outputs_are_in_workflow_and_are_compatible(
            tool, validation.fields, Vcf())

        # Missing outputs are fatal; outputs of an incompatible type are only logged.
        if missing:
            missing_csv = ", ".join(missing)
            raise Exception(
                f"Some outputs for validation were not found in the tool '{tool.id()}': "
                f"{missing_csv}")

        if incompatible:
            incompatible_csv = ", ".join(incompatible)
            Logger.critical(
                f"Some outputs for validation from the tool '{tool.id()}' were not "
                f"compatible with VCF: {incompatible_csv}")

        wrapper = WorkflowBuilder(tool.id() + "_validated")

        # Inputs shared by every validator step.
        wrapper.input(
            "validatorReference", FastaWithDict, value=validation.reference)
        wrapper.input("validatorTruthVCF", Vcf, value=validation.truthVCF)
        wrapper.input(
            "validatorIntervals", Bed(optional=True), value=validation.intervals)

        # Mirror each input of the wrapped tool onto the wrapper workflow.
        forwarded = {}
        for tool_input in tool.tool_inputs():
            key = tool_input.id()
            forwarded[key] = wrapper.input(tool_input.id(), tool_input.intype)
        tool_step = wrapper.step(tool.id(), tool(**forwarded))

        # Re-expose the wrapped tool's outputs; workflows also keep their
        # output folder / name settings.
        if isinstance(tool, Workflow):
            for out_node in tool.output_nodes.values():
                wrapper.output(
                    identifier=out_node.id(),
                    source=tool_step[out_node.id()],
                    output_folder=out_node.output_folder,
                    output_name=out_node.output_name,
                )
        else:
            for tool_out in tool.tool_outputs():
                wrapper.output(
                    identifier=tool_out.id(), source=tool_step[tool_out.id()])

        for field in validation.fields:
            validator_step = wrapper.step(
                "validator_" + field,
                HapPyValidator_0_3_9(
                    compareVCF=tool_step[field],
                    # this will generate an input node with format validator_{field}_reportPrefix
                    reportPrefix=field,
                    reference=wrapper.validatorReference,
                    truthVCF=wrapper.validatorTruthVCF,
                    intervals=wrapper.validatorIntervals,
                ),
            )

            # Surface every validator output on the wrapper workflow.
            for validator_out in validator_step.tool.outputs():
                wrapper.output(
                    f"validated_{field}_{validator_out.id()}",
                    source=validator_step[validator_out.id()],
                    output_folder="validated",
                )

        return wrapper
Esempio n. 28
0
 def outputs(self) -> List[ToolOutput]:
     """Declare the tool outputs.

     Returns:
         Two outputs: "vcf" (a ``Vcf``) selected via the "outputFilename"
         input, and "tsv" (a ``Tsv``) selected via the "regions" input.
     """
     vcf_output = ToolOutput("vcf", Vcf(), InputSelector("outputFilename"))
     tsv_output = ToolOutput("tsv", Tsv(), InputSelector("regions"))
     return [vcf_output, tsv_output]
 def outputs(self) -> List[ToolOutput]:
     """Declare the single "out" output, a ``Vcf`` globbed from the "outputFilename" input."""
     out_vcf = ToolOutput("out", Vcf(), glob=InputSelector("outputFilename"))
     return [out_vcf]
Esempio n. 30
0
class Gatk4GenotypeConcordanceBase(Gatk4ToolBase, ABC):
    @classmethod
    def gatk_command(cls):
        """Return the command name this tool wraps: ``GenotypeConcordance``."""
        return "GenotypeConcordance"

    def tool(self):
        """Return the identifier string of this tool: ``Gatk4GenotypeConcordance``."""
        return "Gatk4GenotypeConcordance"

    def friendly_name(self):
        """Return the human-readable display name of this tool."""
        return "GATK4: Genotype Concordance"

    def inputs(self):
        """Return every input accepted by this tool.

        Returns:
            A list made of, in order: the inputs inherited from
            ``super().inputs()``, the three core inputs (``callVCF``,
            ``truthVCF``, ``outputBasename``) and the class-level
            ``additional_args``.
        """
        return [
            *super().inputs(),
            ToolInput(
                "callVCF",
                VcfTabix(),
                prefix="--CALL_VCF",
                doc="The VCF containing the call sample",
            ),
            ToolInput(
                "truthVCF",
                VcfIdx(),
                prefix="--TRUTH_VCF",
                doc="The VCF containing the truth sample",
            ),
            ToolInput(
                "outputBasename",
                Filename(),
                prefix="--OUTPUT",
                # Adjacent literals are concatenated verbatim, so the separating
                # space after the colon has to be part of the first literal.
                doc="Basename for the three metrics files that are to be written. Resulting files will be: "
                "(1) .genotype_concordance_summary_metrics, "
                "(2) .genotype_concordance_detail_metrics, "
                "(3) .genotype_concordance_contingency_metrics.",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        """Declare the three metrics files collected from the working directory.

        Returns:
            A list of ``ToolOutput`` objects (``summaryMetrics``,
            ``detailMetrics``, ``contingencyMetrics``), each a ``File`` found
            by wildcard with ``select_first=True``.
        """
        # (output tag, wildcard pattern) pairs, in the order they are declared.
        metrics_files = (
            ("summaryMetrics", "*.genotype_concordance_summary_metrics"),
            ("detailMetrics", "*.genotype_concordance_detail_metrics"),
            ("contingencyMetrics", "*.genotype_concordance_contingency_metrics"),
        )
        # NOTE: no VCF output is declared here, even though the ``outputVcf``
        # input (--OUTPUT_VCF) exists; a "*.vcf" output was previously commented out.
        return [
            ToolOutput(tag, File(), glob=WildcardSelector(pattern, select_first=True))
            for tag, pattern in metrics_files
        ]

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2018, 12, 24),
            dateUpdated=date(2019, 1, 24),
            institution="Broad Institute",
            doi=None,
            citation="See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information",
            keywords=["gatk", "gatk4", "broad", "genotype concordance"],
            documentationUrl="https://software.broadinstitute.org/gatk/documentation/tooldocs/4.0.5.0/picard_vcf_GenotypeConcordance.php",
            documentation="""GenotypeConcordance (Picard)
            
Calculates the concordance between genotype data of one samples in each of two VCFs - one being 
considered the truth (or reference) the other being the call. The concordance is broken into 
separate results sections for SNPs and indels. Statistics are reported in three different files.

Summary
    Calculates the concordance between genotype data of one samples in each of two VCFs - one being 
    considered the truth (or reference) the other being the call. The concordance is broken into 
    separate results sections for SNPs and indels. Summary and detailed statistics are reported.

Details
    This tool evaluates the concordance between genotype calls for a sample in different callsets
    where one is being considered as the "truth" (aka standard, or reference) and the other as the 
    "call" that is being evaluated for accuracy. The Comparison can be restricted to a confidence 
    interval which is typically used in order to enable proper assessment of False Positives and 
    the False-Positive Rate (FPR).
 
Output Metrics:
    Output metrics consists of GenotypeConcordanceContingencyMetrics, GenotypeConcordanceSummaryMetrics, 
    and GenotypeConcordanceDetailMetrics. For each set of metrics, the data is broken into separate 
    sections for SNPs and INDELs. Note that only SNP and INDEL variants are considered, MNP, Symbolic, 
    and Mixed classes of variants are not included.

    GenotypeConcordanceContingencyMetrics enumerate the constituents of each contingent in a callset 
    including true-positive (TP), true-negative (TN), false-positive (FP), and false-negative (FN) calls.
    GenotypeConcordanceDetailMetrics include the numbers of SNPs and INDELs for each contingent genotype 
    as well as the number of validated genotypes.

    GenotypeConcordanceSummaryMetrics provide specific details for the variant caller performance 
    on a callset including values for sensitivity, specificity, and positive predictive values.


Useful definitions applicable to alleles and genotypes:
    - Truthset - A callset (typically in VCF format) containing variant calls and genotypes that have been 
        cross-validated with multiple technologies e.g. Genome In A Bottle Consortium (GIAB) (https://sites.stanford.edu/abms/giab)
    - TP - True-positives are variant sites that match against the truth-set
    - FP - False-positives are reference sites miscalled as variant
    - FN - False-negatives are variant sites miscalled as reference
    - TN - True-negatives are correctly called as reference
    - Validated genotypes - are TP sites where the exact genotype (HET or HOM-VAR) appears in the truth-set

VCF Output:
    - The concordance state will be stored in the CONC_ST tag in the INFO field
    - The truth sample name will be \"truth\" and call sample name will be \"call\"  
""".strip(),
        )

    # Optional inputs of the tool. ``inputs()`` appends this list after the
    # inherited and required inputs. Every entry is declared optional.
    additional_args = [
        ToolInput(
            "argumentsFile",
            Array(File(), optional=True),
            prefix="--arguments_file",
            position=10,
            doc="read one or more arguments files and add them to the command line",
        ),
        ToolInput(
            "callSample",
            String(optional=True),
            prefix="--CALL_SAMPLE",
            position=10,
            doc="The name of the call sample within the call VCF. Not required if only one sample exists.",
        ),
        ToolInput(
            "ignoreFilterStatus",
            Boolean(optional=True),
            prefix="--IGNORE_FILTER_STATUS",
            doc="Default is false. If true, filter status of sites will be ignored so that we "
            "include filtered sites when calculating genotype concordance.",
        ),
        ToolInput(
            "intersectIntervals",
            Boolean(optional=True),
            prefix="--INTERSECT_INTERVALS",
            doc="If true, multiple interval lists will be intersected. If false multiple lists will be unioned.",
        ),
        # NOTE(review): the doc describes interval list files but the element
        # type is Vcf(); left unchanged because retyping would alter the tool's
        # interface. Confirm the intended type.
        ToolInput(
            "intervals",
            Array(Vcf(), optional=True),
            prefix="--INTERVALS",
            doc="One or more interval list files that will be used to limit the genotype concordance. "
            "Note - if intervals are specified, the VCF files must be indexed.",
        ),
        ToolInput(
            "minDP",
            Float(optional=True),
            prefix="--MIN_DP",
            doc="Genotypes below this depth will have genotypes classified as LowDp.",
        ),
        ToolInput(
            "minGQ",
            Float(optional=True),
            prefix="--MIN_GQ",
            doc="Genotypes below this genotype quality will have genotypes classified as LowGq.",
        ),
        # NOTE(review): the identifier is spelled "HomeRef" while the flag is
        # HOM_REF; kept as-is because callers refer to the input by this name.
        ToolInput(
            "treatMissingSitesAsHomeRef",
            Boolean(optional=True),
            prefix="--MISSING_SITES_HOM_REF",
            doc="Default is false, which follows the GA4GH Scheme. If true, missing sites in the truth \n"
            "set will be treated as HOM_REF sites and sites missing in both the truth and call sets "
            "will be true negatives. Useful when hom ref sites are left out of the truth set. "
            "This flag can only be used with a high confidence interval list.",
        ),
        ToolInput(
            "outputAllRows",
            Boolean(optional=True),
            prefix="--OUTPUT_ALL_ROWS",
            doc="If true, output all rows in detailed statistics even when count == 0. When false only "
            "output rows with non-zero counts.",
        ),
        ToolInput(
            "outputVcf",
            Boolean(optional=True),
            prefix="--OUTPUT_VCF",
            doc="Output a VCF annotated with concordance information.",
        ),
        ToolInput(
            "truthSample",
            String(optional=True),
            prefix="--TRUTH_SAMPLE",
            doc="The name of the truth sample within the truth VCF. Not required if only one sample exists.",
        ),
        ToolInput(
            "useVcfIndex",
            Boolean(optional=True),
            prefix="--USE_VCF_INDEX",
            doc="If true, use the VCF index, else iterate over the entire VCF",
        ),
        ToolInput(
            "compressionLevel",
            Int(optional=True),
            prefix="--COMPRESSION_LEVEL",
            position=11,
            doc="Compression level for all compressed files created (e.g. BAM and GELI).",
        ),
        ToolInput(
            "createIndex",
            Boolean(optional=True),
            prefix="--CREATE_INDEX",
            position=11,
            doc="Whether to create a BAM index when writing a coordinate-sorted BAM file.",
        ),
        ToolInput(
            "createMd5File",
            Boolean(optional=True),
            prefix="--CREATE_MD5_FILE",
            position=11,
            doc="Whether to create an MD5 digest for any BAM or FASTQ files created.",
        ),
        ToolInput(
            "maxRecordsInRam",
            Int(optional=True),
            prefix="--MAX_RECORDS_IN_RAM",
            position=11,
            doc="When writing SAM files that need to be sorted, this will specify the number of "
            "records stored in RAM before spilling to disk. Increasing this number reduces "
            "the number of file handles needed to sort a SAM file, and increases the amount of RAM needed.",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--QUIET",
            position=11,
            doc="Whether to suppress job-summary info on System.err.",
        ),
        ToolInput(
            "reference",
            File(optional=True),
            prefix="--REFERENCE_SEQUENCE",
            position=11,
            doc="Reference sequence file.",
        ),
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--TMP_DIR",
            position=11,
            default="/tmp/",
            doc="Undocumented option",
        ),
        ToolInput(
            "useJdkDeflater",
            Boolean(optional=True),
            prefix="--use_jdk_deflater",
            position=11,
            doc="Whether to use the JdkDeflater (as opposed to IntelDeflater)",
        ),
        ToolInput(
            "useJdkInflater",
            Boolean(optional=True),
            prefix="--use_jdk_inflater",
            position=11,
            doc="Whether to use the JdkInflater (as opposed to IntelInflater)",
        ),
        ToolInput(
            "validationStringency",
            String(optional=True),
            prefix="--VALIDATION_STRINGENCY",
            position=11,
            # Adjacent literals are joined verbatim: the space after "decoded."
            # keeps the two sentences from running together.
            doc="Validation stringency for all SAM files read by this program. Setting stringency to SILENT "
            "can improve performance when processing a BAM file in which variable-length data "
            "(read, qualities, tags) do not otherwise need to be decoded. "
            "The --VALIDATION_STRINGENCY argument is an enumerated type (ValidationStringency), "
            "which can have one of the following values: [STRICT, LENIENT, SILENT]",
        ),
        ToolInput(
            "verbosity",
            String(optional=True),
            prefix="--verbosity",
            position=11,
            doc="The --verbosity argument is an enumerated type (LogLevel), which can have "
            "one of the following values: [ERROR, WARNING, INFO, DEBUG]",
        ),
    ]