def tumor_normal_inputs():
    """Build the ToolInputs describing a tumor/normal BAM pair.

    Returns:
        A list with, in order: the tumor BAM (``-I``), the tumor sample name
        (``-tumor``), the normal BAM (``-I``) and the normal sample name
        (``-normal``).
    """
    reads_doc = "BAM/SAM/CRAM file containing reads"

    tumor_bam = ToolInput(
        "tumor", BamBai(), prefix="-I", position=6, doc=reads_doc
    )
    tumor_name = ToolInput(
        "tumorName",
        String(),
        prefix="-tumor",
        position=6,
        doc="BAM sample name of tumor. May be URL-encoded as output by GetSampleName with -encode.",
    )
    normal_bam = ToolInput(
        "normal", BamBai(), prefix="-I", position=5, doc=reads_doc
    )
    normal_name = ToolInput(
        "normalName",
        String(),
        prefix="-normal",
        position=6,
        doc="BAM sample name of normal. May be URL-encoded as output by GetSampleName with -encode.",
    )

    return [tumor_bam, tumor_name, normal_bam, normal_name]
def inputs(self):
    """Declare the command-line inputs of this tool.

    Returns:
        The unmapped BAM, the aligned BAM(s), the optional reference and the
        generated output filename (all at position 10), followed by every
        entry of ``self.additional_args``.
    """
    unmapped = ToolInput(
        "ubam",
        BamBai(),
        position=10,
        prefix="--UNMAPPED_BAM",
        prefix_applies_to_all_elements=True,
        doc="Original SAM or BAM file of unmapped reads, which must be in queryname order.",
    )
    aligned = ToolInput(
        "bam",
        Array(BamBai()),
        position=10,
        prefix="--ALIGNED_BAM",
        prefix_applies_to_all_elements=True,
        doc="SAM or BAM file(s) with alignment data.",
    )
    reference = ToolInput(
        "reference",
        FastaWithDict(optional=True),
        position=10,
        prefix="--REFERENCE_SEQUENCE",
        doc="Reference sequence file.",
    )
    merged_name = ToolInput(
        "outputFilename",
        Filename(extension=".bam"),
        position=10,
        prefix="--OUTPUT",
        doc="Merged SAM or BAM file to write to.",
    )

    return [unmapped, aligned, reference, merged_name, *self.additional_args]
def tests(self) -> Optional[List[TTestCase]]:
    """Return the regression test case(s) for this workflow.

    A single ``"basic"`` case is defined. Its inputs are paired FASTQ files
    for a normal and a tumor sample plus reference resources, all addressed
    by URL under ``parent_dir``. The expected outputs are assembled from the
    ``basic_test`` helpers of ``BamBai``, ``TextFile`` and ``FastqGzPair``.

    Returns:
        A one-element list of ``TTestCase``.
    """
    parent_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics"
    germline_data = f"{parent_dir}/wgsgermline_data"
    somatic_data = f"{parent_dir}/wgssomatic_data"

    return [
        TTestCase(
            name="basic",
            input={
                # Each sample is one read pair: the _R1 file and its _R2 mate.
                "normal_inputs": [
                    [
                        f"{somatic_data}/NA24385-BRCA1_R1.fastq.gz",
                        f"{somatic_data}/NA24385-BRCA1_R2.fastq.gz",
                    ]
                ],
                "normal_name": "NA24385-BRCA1",
                "tumor_inputs": [
                    [
                        f"{somatic_data}/NA12878-NA24385-mixture-BRCA1_R1.fastq.gz",
                        f"{somatic_data}/NA12878-NA24385-mixture-BRCA1_R2.fastq.gz",
                    ]
                ],
                "tumor_name": "NA12878-NA24385-mixture",
                "reference": f"{germline_data}/Homo_sapiens_assembly38.chr17.fasta",
                "gridss_blacklist": f"{somatic_data}/consensusBlacklist.hg38.chr17.bed",
                "gnomad": f"{somatic_data}/af-only-gnomad.hg38.BRCA1.vcf.gz",
                "gatk_intervals": [f"{germline_data}/BRCA1.hg38.bed"],
                "known_indels": f"{germline_data}/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz",
                "mills_indels": f"{germline_data}/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz",
                "snps_1000gp": f"{germline_data}/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz",
                "snps_dbsnp": f"{germline_data}/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz",
                "cutadapt_adapters": f"{germline_data}/contaminant_list.txt",
            },
            output=BamBai.basic_test("out_normal_bam", 3265300, 49500)
            + BamBai.basic_test("out_tumor_bam", 3341700, 49000)
            + TextFile.basic_test(
                "out_normal_performance_summary",
                950,
                md5="e3205735e5fe8c900f05050f8ed73f19",
            )
            + TextFile.basic_test(
                "out_tumor_performance_summary",
                950,
                md5="122bfa2ece90c0f030015feba4ba7d84",
            )
            + FastqGzPair.basic_test("out_normal_fastqc_reports", 881300)
            + FastqGzPair.basic_test("out_tumor_fastqc_reports", 874900),
        )
    ]
def tests(self):
    """Return the ``"basic"`` and ``"minimal"`` test cases.

    Both cases use the same inputs (one sorted BAM fetched from
    ``remote_dir`` plus step settings). They differ only in the expected
    output: ``"basic"`` uses ``BamBai.basic_test`` and ``"minimal"`` uses
    ``self.minimal_test()``.
    """
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

    def fresh_inputs():
        # A new dict per test case so the two cases never share state.
        return {
            "bams": [
                f"{remote_dir}/NA12878-BRCA1.sorted.bam",
            ],
            "maxRecordsInRam": 5000000,
            "createIndex": True,
            "mergeSamFiles_useThreading": True,
            "mergeSamFiles_validationStringency": "SILENT",
        }

    basic_case = TTestCase(
        name="basic",
        input=fresh_inputs(),
        output=BamBai.basic_test(
            "out",
            2829000,
            3780,
            f"{remote_dir}/NA12878-BRCA1.markduped.bam.flagstat",
        ),
    )
    minimal_case = TTestCase(
        name="minimal",
        input=fresh_inputs(),
        output=self.minimal_test(),
    )
    return [basic_case, minimal_case]
def inputs(self):
    """Declare the inputs of the insert-size-metrics tool.

    Returns:
        The inputs inherited via ``super(Gatk4CollectInsertSizeMetricsBase,
        self)``, then the BAM to analyse and two generated output names (a
        ``.metrics.txt`` file and a ``.histogram.pdf`` file, both prefixed
        from the ``bam`` input), then
        ``Gatk4CollectInsertSizeMetricsBase.additional_args``.
    """
    declared = list(super(Gatk4CollectInsertSizeMetricsBase, self).inputs())

    declared.append(
        ToolInput(
            "bam",
            BamBai(optional=False),
            position=10,
            prefix="-I",
            doc="Input SAM or BAM file. Required.",
        )
    )

    metrics_name = Filename(
        prefix=InputSelector("bam", remove_file_extension=True),
        extension=".txt",
        suffix=".metrics",
    )
    declared.append(
        ToolInput(
            "outputFilename",
            metrics_name,
            prefix="-O",
            doc="File to write the output to. Required.",
        )
    )

    histogram_name = Filename(
        prefix=InputSelector("bam", remove_file_extension=True),
        extension=".pdf",
        suffix=".histogram",
    )
    declared.append(
        ToolInput(
            "outputHistogram",
            histogram_name,
            prefix="-H",
            doc="File to write insert size Histogram chart to. Required. ",
        )
    )

    declared.extend(Gatk4CollectInsertSizeMetricsBase.additional_args)
    return declared
def inputs(self):
    """Declare the inputs of the depth-of-coverage tool.

    Returns:
        The inputs inherited via ``super(Gatk4DepthOfCoverageBase, self)``,
        then the BAM, reference, output prefix and the interval BED files
        (the ``--intervals`` prefix is repeated for every element), then
        ``self.additional_args``.
    """
    inherited = super(Gatk4DepthOfCoverageBase, self).inputs()

    bam = ToolInput(
        "bam",
        BamBai(),
        prefix="-I",
        secondaries_present_as={".bai": "^.bai"},
        doc="The SAM/BAM/CRAM file containing reads.",
    )
    reference = ToolInput(
        "reference",
        FastaWithDict(),
        prefix="-R",
        doc="Reference sequence",
    )
    output_prefix = ToolInput(
        "outputPrefix",
        String(),
        prefix="-O",
        doc="An output file created by the walker. Will overwrite contents if file exists",
    )
    intervals = ToolInput(
        "intervals",
        Array(Bed),
        prefix="--intervals",
        prefix_applies_to_all_elements=True,
        doc="-L (BASE) One or more genomic intervals over which to operate",
    )

    return [
        *inherited,
        bam,
        reference,
        output_prefix,
        intervals,
        *self.additional_args,
    ]
def inputs(self):
    """Declare the inputs of this tool.

    Returns:
        A list with the input BAMs, the reference, the generated VCF and
        assembly-BAM filenames, an optional thread count (defaulting to
        ``CpuSelector()``), an optional blacklist BED and the working
        directory (default ``"./TMP"``).
    """
    bams = ToolInput("bams", Array(BamBai()), position=10)
    reference = ToolInput(
        "reference", FastaWithDict(), prefix="--reference", position=1
    )
    vcf_name = ToolInput(
        "outputFilename",
        Filename(suffix=".svs", extension=".vcf"),
        prefix="--output",
        position=2,
    )
    assembly_name = ToolInput(
        "assemblyFilename",
        Filename(suffix=".assembled", extension=".bam"),
        prefix="--assembly",
        position=3,
    )
    threads = ToolInput(
        "threads", Int(optional=True), prefix="--threads", default=CpuSelector()
    )
    blacklist = ToolInput(
        "blacklist", Bed(optional=True), prefix="--blacklist", position=4
    )
    working_dir = ToolInput(
        "tmpdir", String(optional=True), prefix="--workingdir", default="./TMP"
    )

    return [
        bams,
        reference,
        vcf_name,
        assembly_name,
        threads,
        blacklist,
        working_dir,
    ]
def inputs(self):
    """Declare the inputs of the pileup-summaries tool.

    Returns:
        The parent class inputs, then
        ``Gatk4GetPileUpSummariesBase.additional_args``, then the input
        BAM(s), the sites VCF, the optional intervals and the generated
        ``.txt`` output table name.
    """
    declared = list(super().inputs())
    declared.extend(Gatk4GetPileUpSummariesBase.additional_args)

    declared.append(
        ToolInput(
            "bam",
            Array(BamBai()),
            position=0,
            prefix="-I",
            prefix_applies_to_all_elements=True,
            doc="The SAM/BAM/CRAM file containing reads.",
        )
    )
    declared.append(
        ToolInput(
            "sites",
            VcfTabix(),
            prefix="-V",
            doc="sites of common biallelic variants",
        )
    )
    declared.append(
        ToolInput(
            "intervals",
            VcfTabix(optional=True),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
        )
    )
    declared.append(
        ToolInput(
            "pileupTableOut",
            Filename(extension=".txt"),
            prefix="-O",
            position=1,
        )
    )
    return declared
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    Input and expected-flagstat paths are resolved under
    ``BioinformaticsTool.test_data_path()`` in ``wgsgermline_data``. The
    expectations combine ``BamBai.basic_test`` for ``out`` with
    ``TextFile.basic_test`` for ``metrics``.
    """
    merged_bam = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.merged.bam",
    )
    case_inputs = {
        "bam": [merged_bam],
        "javaOptions": ["-Xmx6G"],
        "maxRecordsInRam": 5000000,
        "createIndex": True,
        "tmpDir": "./tmp",
    }

    expected_flagstat = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.markduped.bam.flagstat",
    )
    bam_checks = BamBai.basic_test("out", 2829000, 3780, expected_flagstat)
    metrics_checks = TextFile.basic_test(
        "metrics",
        3700,
        "NA12878-BRCA1\t193\t9468\t164\t193\t46\t7\t1\t0.003137\t7465518",
        112,
    )

    return [
        TTestCase(
            name="basic",
            input=case_inputs,
            output=bam_checks + metrics_checks,
        )
    ]
def inputs(self) -> List[ToolInput]:
    """Declare the inputs of this tool.

    Every explicit input is declared with ``shell_quote=False``. The output
    filename uses ``">"`` as its prefix.

    Returns:
        The intervals BED, the generated ``.vardict.vcf`` output name, the
        indexed BAM and the reference, followed by
        ``VarDictGermlineBase.vardict_inputs`` and
        ``VarDictGermlineBase.var2vcf_inputs``.
    """
    intervals = ToolInput("intervals", Bed(), shell_quote=False, position=2)
    vcf_name = ToolInput(
        "outputFilename",
        Filename(extension=".vcf", suffix=".vardict"),
        position=6,
        prefix=">",
        shell_quote=False,
    )
    bam = ToolInput(
        "bam",
        BamBai(),
        position=1,
        prefix="-b",
        shell_quote=False,
        doc="The indexed BAM file",
    )
    reference = ToolInput(
        "reference",
        FastaFai(),
        position=1,
        prefix="-G",
        shell_quote=False,
        doc="The reference fasta. Should be indexed (.fai). "
        "Defaults to: /ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa",
    )

    return [
        intervals,
        vcf_name,
        bam,
        reference,
        *VarDictGermlineBase.vardict_inputs,
        *VarDictGermlineBase.var2vcf_inputs,
    ]
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    The input BAM and the expected flagstat file are resolved under
    ``BioinformaticsTool.test_data_path()`` in ``wgsgermline_data``. The
    expectation for ``out`` comes from ``BamBai.basic_test``.
    """
    sorted_bam = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.sorted.bam",
    )
    case_inputs = {
        "bams": [sorted_bam],
        "createIndex": True,
        "validationStringency": "SILENT",
        "javaOptions": ["-Xmx6G"],
        "maxRecordsInRam": 5000000,
        "tmpDir": "./tmp",
        "useThreading": True,
    }

    expected_flagstat = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.bam.flagstat",
    )
    expected = BamBai.basic_test(
        "out",
        2826968,
        49688,
        expected_flagstat,
        "963a51f7feed5b829319b947961b8a3e",
        "231c10d0e43766170f5a7cd1b8a6d14e",
    )

    return [TTestCase(name="basic", input=case_inputs, output=expected)]
def inputs(self) -> List[ToolInput]:
    """Declare the inputs of this tool.

    Returns:
        The ``piscesVersion`` string, the input BAM, the output folder and
        the reference-genome folder (the last three with
        ``shell_quote=False``), followed by ``self.additional_hygea_args``.
    """
    version = ToolInput("piscesVersion", String())
    input_bam = ToolInput(
        "inputBam",
        BamBai(),
        position=4,
        prefix="-b",
        shell_quote=False,
        doc="Input BAM file",
    )
    out_folder = ToolInput(
        "outputDir",
        String(),
        position=4,
        prefix="--outfolder",
        shell_quote=False,
        doc="Output Folder",
    )
    genome_folder = ToolInput(
        "referenceFolder",
        Directory(),
        position=5,
        prefix="--genomefolders",
        shell_quote=False,
        doc="Folder containing reference genome files",
    )

    return [
        version,
        input_bam,
        out_folder,
        genome_folder,
        *self.additional_hygea_args,
    ]
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    The BAM, the intervals BED and the expected flagstat are resolved under
    ``BioinformaticsTool.test_data_path()`` in ``wgsgermline_data``. The
    expectation for ``out`` comes from ``BamBai.basic_test``.
    """
    recalibrated_bam = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.recalibrated.bam",
    )
    intervals_bed = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "BRCA1.hg38.bed",
    )
    case_inputs = {
        "bam": recalibrated_bam,
        "intervals": intervals_bed,
        "javaOptions": ["-Xmx3G"],
        "outputFilename": ".",
    }

    expected_flagstat = os.path.join(
        BioinformaticsTool.test_data_path(),
        "wgsgermline_data",
        "NA12878-BRCA1.split.flagstat",
    )
    expected = BamBai.basic_test("out", 2600900, 21300, expected_flagstat)

    return [TTestCase(name="basic", input=case_inputs, output=expected)]
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    All input files and the expected flagstat are addressed by URL under
    ``remote_dir``. The expectations combine ``VcfTabix.basic_test`` for
    ``out`` with ``BamBai.basic_test`` for ``bam``.
    """
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

    case_inputs = {
        "inputRead": f"{remote_dir}/NA12878-BRCA1.split.bam",
        "reference": f"{remote_dir}/Homo_sapiens_assembly38.chr17.fasta",
        "intervals": f"{remote_dir}/BRCA1.hg38.bed",
        "dbsnp": f"{remote_dir}/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz",
        "javaOptions": ["-Xmx6G"],
        "pairHmmImplementation": "LOGLESS_CACHING",
    }

    vcf_checks = VcfTabix.basic_test(
        "out",
        12800,
        270,
        214,
        ["GATKCommandLine"],
        "0224e24e5fc27286ee90c8d3c63373a7",
    )
    bam_checks = BamBai.basic_test(
        "bam",
        596698,
        21272,
        f"{remote_dir}/NA12878-BRCA1.haplotyped.flagstat",
        "d83b4c0d8eab24a3be1cc6af4f827753",
        "b4bb4028b8679a3a635e3ad87126a097",
    )

    return [
        TTestCase(
            name="basic",
            input=case_inputs,
            output=vcf_checks + bam_checks,
        )
    ]
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    The input BAM and the expected flagstat are addressed by URL under
    ``remote_dir``. The expectation for ``out`` comes from
    ``BamBai.basic_test``.
    """
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

    case_inputs = {
        "bams": [
            f"{remote_dir}/NA12878-BRCA1.sorted.bam",
        ],
        "createIndex": True,
        "validationStringency": "SILENT",
        "javaOptions": ["-Xmx6G"],
        "maxRecordsInRam": 5000000,
        "tmpDir": "./tmp",
        "useThreading": True,
    }
    expected = BamBai.basic_test(
        "out",
        2826968,
        49688,
        f"{remote_dir}/NA12878-BRCA1.bam.flagstat",
        "963a51f7feed5b829319b947961b8a3e",
        "231c10d0e43766170f5a7cd1b8a6d14e",
    )

    return [TTestCase(name="basic", input=case_inputs, output=expected)]
def inputs(self):
    """Declare the inputs of the apply-BQSR tool.

    Returns:
        The inputs inherited via ``super(Gatk4ApplyBqsrBase, self)``, then
        the BAM, the reference, the generated ``.bam`` output name, the
        optional recalibration table and the optional intervals BED, then
        ``self.additional_args``.
    """
    declared = list(super(Gatk4ApplyBqsrBase, self).inputs())

    declared.append(
        ToolInput(
            "bam",
            BamBai(),
            position=10,
            prefix="-I",
            secondaries_present_as={".bai": "^.bai"},
            doc="The SAM/BAM/CRAM file containing reads.",
        )
    )
    declared.append(
        ToolInput(
            "reference",
            FastaWithDict(),
            prefix="-R",
            doc="Reference sequence",
        )
    )
    declared.append(
        ToolInput(
            "outputFilename",
            Filename(extension=".bam"),
            prefix="-O",
            doc="Write output to this file",
        )
    )
    declared.append(
        ToolInput(
            "recalFile",
            Tsv(optional=True),
            prefix="--bqsr-recal-file",
            doc="Input recalibration table for BQSR",
        )
    )
    declared.append(
        ToolInput(
            "intervals",
            Bed(optional=True),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
        )
    )

    declared.extend(self.additional_args)
    return declared
def inputs(self):
    """Declare the inputs of this tool.

    Returns:
        The BAM, the reference, the output prefix, the optional intervals
        file and the optional exclude-intervals file, followed by
        ``self.additional_args``.
    """
    bam = ToolInput(
        "bam",
        BamBai(),
        position=10,
        prefix="-I",
        secondaries_present_as={".bai": "^.bai"},
        doc="Input file containing sequence data (BAM or CRAM)",
    )
    reference = ToolInput(
        "reference",
        FastaWithDict(),
        prefix="-R",
        doc="Reference sequence file",
    )
    output_prefix = ToolInput(
        "outputPrefix",
        String(),
        prefix="-o",
        doc="An output file created by the walker. Will overwrite contents if file exists",
    )
    include = ToolInput(
        "intervals",
        File(optional=True),
        prefix="-L",
        doc="One or more genomic intervals over which to operate",
    )
    exclude = ToolInput(
        "excludeIntervals",
        File(optional=True),
        prefix="--excludeIntervals",
        doc="One or more genomic intervals to exclude from processing",
    )

    return [
        bam,
        reference,
        output_prefix,
        include,
        exclude,
        *self.additional_args,
    ]
def tests(self):
    """Return the single ``"basic"`` test case for this tool.

    The input BAM and the expected flagstat are addressed by URL under
    ``remote_dir``. The expectations combine ``BamBai.basic_test`` for
    ``out`` with ``TextFile.basic_test`` for ``metrics``.
    """
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

    case_inputs = {
        "bam": [f"{remote_dir}/NA12878-BRCA1.merged.bam"],
        "javaOptions": ["-Xmx6G"],
        "maxRecordsInRam": 5000000,
        "createIndex": True,
        "tmpDir": "./tmp",
    }

    bam_checks = BamBai.basic_test(
        "out",
        2829000,
        3780,
        f"{remote_dir}/NA12878-BRCA1.markduped.bam.flagstat",
    )
    metrics_checks = TextFile.basic_test(
        "metrics",
        3700,
        "NA12878-BRCA1\t193\t9468\t164\t193\t46\t7\t1\t0.003137\t7465518",
        112,
    )

    return [
        TTestCase(
            name="basic",
            input=case_inputs,
            output=bam_checks + metrics_checks,
        )
    ]
def tests(self):
    """Return the ``"basic"`` and ``"minimal"`` test cases.

    Both cases use the same inputs (BAM, reference, recalibration table and
    intervals, all fetched from ``remote_dir``). ``"basic"`` checks ``out``
    via ``BamBai.basic_test``; ``"minimal"`` uses ``self.minimal_test()``.
    """
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"

    def fresh_inputs():
        # A new dict per test case so the two cases never share state.
        return {
            "bam": f"{remote_dir}/NA12878-BRCA1.markduped.bam",
            "reference": f"{remote_dir}/Homo_sapiens_assembly38.chr17.fasta",
            "recalFile": f"{remote_dir}/NA12878-BRCA1.markduped.table",
            "intervals": f"{remote_dir}/BRCA1.hg38.bed",
        }

    basic_case = TTestCase(
        name="basic",
        input=fresh_inputs(),
        output=BamBai.basic_test(
            "out",
            2600000,
            21000,
            f"{remote_dir}/NA12878-BRCA1.recalibrated.flagstat",
        ),
    )
    minimal_case = TTestCase(
        name="minimal",
        input=fresh_inputs(),
        output=self.minimal_test(),
    )
    return [basic_case, minimal_case]
def inputs(self):
    """Declare the inputs of the merge tool.

    Returns:
        The parent class inputs, then the BAMs to merge (``-I`` repeated per
        element), an optional sample name, and the generated ``.merged.bam``
        output name prefixed from ``sampleName``, then
        ``self.additional_args``.
    """
    inherited = super().inputs()

    bams = ToolInput(
        "bams",
        Array(BamBai()),
        position=10,
        prefix="-I",
        prefix_applies_to_all_elements=True,
        doc="The SAM/BAM file to sort.",
    )
    sample_name = ToolInput(
        "sampleName",
        String(optional=True),
        doc="Used for naming purposes only",
    )
    merged_name = Filename(
        prefix=InputSelector("sampleName"),
        suffix=".merged",
        extension=".bam",
    )
    output = ToolInput(
        "outputFilename",
        merged_name,
        position=10,
        prefix="-O",
        doc="SAM/BAM file to write merged result to",
    )

    return [*inherited, bams, sample_name, output, *self.additional_args]
def outputs(self):
    """Declare the outputs of this tool.

    Returns:
        Four ToolOutputs: ``out`` (globbed from ``outputFilename``),
        ``stats`` (the same name with ``.stats`` appended), ``f1f2r_out``
        (globbed from ``f1r2TarGz_outputFilename``) and the optional ``bam``
        (globbed from ``outputBamName``, with its ``.bai`` secondary).
    """
    placeholder_doc = "To determine type"

    variants = ToolOutput(
        "out",
        VcfTabix,
        glob=InputSelector("outputFilename"),
        doc=placeholder_doc,
    )
    stats = ToolOutput(
        "stats",
        TextFile(extension=".stats"),
        glob=InputSelector("outputFilename") + ".stats",
        doc=placeholder_doc,
    )
    f1r2 = ToolOutput(
        "f1f2r_out",
        TarFileGz,
        glob=InputSelector("f1r2TarGz_outputFilename"),
        doc=placeholder_doc,
    )
    haplotype_bam = ToolOutput(
        "bam",
        BamBai(optional=True),
        glob=InputSelector("outputBamName"),
        secondaries_present_as={".bai": "^.bai"},
        doc="File to which assembled haplotypes should be written",
    )

    return [variants, stats, f1r2, haplotype_bam]
def constructor(self):
    """Wire up the workflow: merge the input BAMs, then mark duplicates.

    Inputs: ``bams``, ``createIndex`` (default ``True``),
    ``maxRecordsInRam`` (default ``5000000``) and an optional
    ``sampleName``.
    Steps: ``mergeSamFiles`` feeds its ``out`` into ``markDuplicates``.
    Output: ``out``, sourced from ``markDuplicates.out``.
    """
    # Inputs must be registered before the steps that reference them.
    self.input("bams", Array(BamBai()))
    self.input("createIndex", Boolean, default=True)
    self.input("maxRecordsInRam", Int, default=5000000)
    self.input("sampleName", String(optional=True))

    merge_tool = Gatk4MergeSamFiles_4_1_2(
        bams=self.bams,
        useThreading=True,
        createIndex=self.createIndex,
        maxRecordsInRam=self.maxRecordsInRam,
        validationStringency="SILENT",
        sampleName=self.sampleName,
    )
    self.step("mergeSamFiles", merge_tool)

    # Built only after the merge step exists, since it consumes that
    # step's output.
    dedup_tool = Gatk4MarkDuplicates_4_1_2(
        bam=self.mergeSamFiles.out,
        createIndex=self.createIndex,
        maxRecordsInRam=self.maxRecordsInRam,
    )
    self.step("markDuplicates", dedup_tool)

    self.output("out", source=self.markDuplicates.out)
def inputs(self):
    """Declare the inputs of the mark-duplicates tool.

    Returns:
        The input BAM, the generated ``.bam`` output name (``-O``) and the
        generated ``.metrics.txt`` name (``-M``), followed by the inputs
        inherited via ``super(Gatk4MarkDuplicatesBase, self)`` and then
        ``self.additional_args``.
    """
    return [
        ToolInput(
            "bam",
            BamBai(),
            prefix="-I",
            position=10,
            secondaries_present_as={".bai": "^.bai"},
            doc="One or more input SAM or BAM files to analyze. Must be coordinate sorted.",
        ),
        # -O is the BAM of marked records and -M is the metrics report.
        # The two doc strings were previously attached to the wrong inputs.
        ToolInput(
            "outputFilename",
            Filename(extension=".bam"),
            position=10,
            prefix="-O",
            doc="The output file to write marked records to.",
        ),
        ToolInput(
            "metricsFilename",
            Filename(extension=".metrics.txt"),
            position=10,
            prefix="-M",
            doc="File to write duplication metrics to",
        ),
        *super(Gatk4MarkDuplicatesBase, self).inputs(),
        *self.additional_args,
    ]
def inputs(self):
    """Declare the inputs of the pileup-summaries tool.

    Returns:
        The parent class inputs, then
        ``Gatk4GetPileUpSummariesBase.additional_args``, then the input
        BAM(s), an optional sample name, the sites VCF, the optional
        intervals BED, the generated ``.txt`` output table name and an
        optional reference.
    """
    inherited = super().inputs()
    extra_args = Gatk4GetPileUpSummariesBase.additional_args

    bam = ToolInput(
        "bam",
        Array(BamBai()),
        position=0,
        prefix="-I",
        prefix_applies_to_all_elements=True,
        doc="The SAM/BAM/CRAM file containing reads.",
    )
    sample_name = ToolInput(
        "sampleName", String(optional=True), doc="Used for naming purposes"
    )
    sites = ToolInput(
        "sites",
        VcfTabix(),
        prefix="-V",
        doc="sites of common biallelic variants",
    )
    intervals = ToolInput(
        "intervals",
        Bed(optional=True),
        prefix="--intervals",
        doc="-L (BASE) One or more genomic intervals over which to operate",
    )

    # The table prefix is the sample name, or "generated" when no sample
    # name is given. An intervals-derived component was sketched here in
    # the past but is disabled, so only this single part is joined.
    name_parts = [FirstOperator([InputSelector("sampleName"), "generated"])]
    table_prefix = JoinOperator(FilterNullOperator(name_parts), ".")
    pileup_table = ToolInput(
        "pileupTableOut",
        Filename(prefix=table_prefix, extension=".txt"),
        position=1,
        prefix="-O",
    )

    reference = ToolInput(
        "reference",
        FastaWithDict(optional=True),
        prefix="-R",
        doc="reference to use when decoding CRAMS",
    )

    return [
        *inherited,
        *extra_args,
        bam,
        sample_name,
        sites,
        intervals,
        pileup_table,
        reference,
    ]
def inputs(self):
    """Declare the inputs of the base-recalibrator tool.

    Returns:
        The inputs inherited via ``super(Gatk4BaseRecalibratorBase, self)``,
        then ``Gatk4BaseRecalibratorBase.additional_args``, then the BAM,
        the known-sites VCFs (``--known-sites`` repeated per element), the
        reference, the generated ``.table`` output name (prefixed from the
        ``bam`` input) and the optional intervals BED.
    """
    return [
        *super(Gatk4BaseRecalibratorBase, self).inputs(),
        *Gatk4BaseRecalibratorBase.additional_args,
        ToolInput(
            "bam",
            BamBai(),
            position=6,
            prefix="-I",
            doc="BAM/SAM/CRAM file containing reads",
            secondaries_present_as={".bai": "^.bai"},
        ),
        ToolInput(
            "knownSites",
            Array(VcfTabix()),
            prefix="--known-sites",
            position=28,
            prefix_applies_to_all_elements=True,
            doc="**One or more databases of known polymorphic sites used to exclude "
            "regions around known polymorphisms from analysis.** "
            "This algorithm treats every reference mismatch as an indication of error. However, real "
            "genetic variation is expected to mismatch the reference, so it is critical that a "
            "database of known polymorphic sites is given to the tool in order to skip over those sites. "
            "This tool accepts any number of Feature-containing files (VCF, BCF, BED, etc.) for use as "
            "this database. For users wishing to exclude an interval list of known variation simply "
            "use -XL my.interval.list to skip over processing those sites. Please note however "
            "that the statistics reported by the tool will not accurately reflected those sites "
            "skipped by the -XL argument.",
        ),
        ToolInput(
            "reference",
            FastaWithDict(),
            position=5,
            prefix="-R",
            doc="Reference sequence file",
        ),
        ToolInput(
            "outputFilename",
            Filename(prefix=InputSelector("bam"), extension=".table"),
            position=8,
            prefix="-O",
            # This doc was previously one string literal broken across two
            # physical lines, which is a syntax error. It is now a run of
            # adjacent literals that concatenate to the same sentence.
            doc="**The output recalibration table filename to create.** "
            "After the header, data records occur one per line until the end of the file. The first "
            "several items on a line are the values of the individual covariates and will change "
            "depending on which covariates were specified at runtime. The last three items are the "
            "data- that is, number of observations for this combination of covariates, number of "
            "reference mismatches, and the raw empirical quality score calculated by phred-scaling "
            "the mismatch rate. "
            "Use '/dev/stdout' to print to standard out.",
        ),
        ToolInput(
            "intervals",
            Bed(optional=True),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
        ),
    ]
def outputs(self):
    """Declare the single ``out`` output.

    Returns:
        A one-element list: the BAM globbed from the ``outputFilename``
        input, with its index mapped via ``{".bai": "^.bai"}``.
    """
    indexed_bam = ToolOutput(
        "out",
        BamBai(),
        secondaries_present_as={".bai": "^.bai"},
        glob=InputSelector("outputFilename"),
    )
    return [indexed_bam]
def outputs(self) -> List[ToolOutput]:
    """Declare the outputs of this tool.

    Returns:
        Two ToolOutputs: ``vcf`` (globbed from ``outputFilename``) and
        ``assembly`` (the BAM globbed from ``assemblyFilename``, with its
        index mapped via ``{".bai": "^.bai"}``).
    """
    calls = ToolOutput("vcf", Vcf(), glob=InputSelector("outputFilename"))
    assembly_bam = ToolOutput(
        "assembly",
        BamBai(),
        secondaries_present_as={".bai": "^.bai"},
        glob=InputSelector("assemblyFilename"),
    )
    return [calls, assembly_bam]
def outputs(self):
    """Declare the single ``out`` output.

    Returns:
        A one-element list: the BAM globbed from the ``outputFilename``
        input, with its index mapped via ``{".bai": "^.bai"}``.
    """
    extracted = ToolOutput(
        "out",
        BamBai(),
        secondaries_present_as={".bai": "^.bai"},
        glob=InputSelector("outputFilename"),
        doc="BAM to write extracted reads to",
    )
    return [extracted]
def inputs(self):
    """Declare the inputs of the haplotype-caller tool.

    Returns:
        The inputs inherited via ``super(Gatk4HaplotypeCallerBase, self)``,
        then ``Gatk4HaplotypeCallerBase.optional_args``, then the reads BAM,
        the reference, the generated ``.vcf.gz`` output name, the optional
        dbSNP VCF, the optional intervals BED and the generated ``.bam``
        name for ``-bamout``. Both generated names are prefixed from the
        ``inputRead`` input.
    """
    declared = list(super(Gatk4HaplotypeCallerBase, self).inputs())
    declared.extend(Gatk4HaplotypeCallerBase.optional_args)

    declared.append(
        ToolInput(
            "inputRead",
            BamBai(),
            prefix="--input",
            secondaries_present_as={".bai": "^.bai"},
            doc="BAM/SAM/CRAM file containing reads",
        )
    )
    declared.append(
        ToolInput(
            "reference",
            FastaWithDict(),
            prefix="--reference",
            position=5,
            doc="Reference sequence file",
        )
    )

    vcf_name = Filename(
        prefix=InputSelector("inputRead", remove_file_extension=True),
        extension=".vcf.gz",
    )
    declared.append(
        ToolInput(
            "outputFilename",
            vcf_name,
            prefix="--output",
            position=8,
            doc="File to which variants should be written",
        )
    )
    declared.append(
        ToolInput(
            "dbsnp",
            VcfTabix(optional=True),
            prefix="--dbsnp",
            position=7,
            doc="(Also: -D) A dbSNP VCF file.",
        )
    )
    declared.append(
        ToolInput(
            "intervals",
            Bed(optional=True),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
        )
    )

    bam_name = Filename(
        prefix=InputSelector("inputRead", remove_file_extension=True),
        extension=".bam",
    )
    declared.append(
        ToolInput(
            "outputBamName",
            bam_name,
            prefix="-bamout",
            position=8,
            doc="File to which assembled haplotypes should be written",
        )
    )
    return declared
def outputs(self):
    """Declare the single ``out`` output.

    Returns:
        A one-element list: the BAM globbed from the ``outputFilename``
        input, with its index mapped via ``{".bai": "^.bai"}``.
    """
    split_bam = ToolOutput(
        "out",
        BamBai(),
        secondaries_present_as={".bai": "^.bai"},
        glob=InputSelector("outputFilename"),
        doc="BAM file with reads split at N CIGAR elements and CIGAR strings updated.",
    )
    return [split_bam]