def test_dot_4(self):
    """Dot-scatter over four array inputs should compile to nested WDL zip() pairs."""
    w = WorkflowBuilder("sbmf")
    w.input("inp", Array(str))
    w.input("inp2", Array(str))
    w.input("inp3", Array(str))
    w.input("inp4", Array(str))
    step = w.step(
        "dotTool",
        SingleTestTool(inputs=w.inp, input2=w.inp2, input3=w.inp3, input4=w.inp4),
        scatter=ScatterDescription(
            fields=["inputs", "input2", "input3", "input4"],
            method=ScatterMethods.dot,
        ),
    )
    outp = wdl.translate_step_node(
        step, "A.SingleTestTool", {}, {"inp", "inp2", "inp3", "inp4"}
    )
    # NOTE(review): the line breaks inside this expected string were lost in a
    # whitespace-mangled copy of this file and are reconstructed here — confirm
    # against the actual output of translate_step_node before relying on them.
    expected = """\
scatter (Q in zip(inp, zip(inp2, zip(inp3, inp4)))) {
  call A.SingleTestTool as dotTool {
    input:
      inputs=Q.left,
      input2=Q.right.left,
      input3=Q.right.right.left,
      input4=Q.right.right.right
  }
}"""
    self.assertEqual(expected, outp.get_string(indent=0))
def inputs(self):
    """Two localised file-array inputs plus the generated .tar output filename."""
    primary_files = ToolInput("files", Array(File()), position=2, localise_file=True)
    secondary_files = ToolInput(
        "files2", Array(File(), optional=True), position=3, localise_file=True
    )
    tar_name = ToolInput("outputFilename", Filename(extension=".tar"), position=1)
    return [primary_files, secondary_files, tar_name]
def inputs(self):
    """Declare the command-line inputs exposed for the STAR aligner."""
    return [
        # Options shared by all STAR subcommands, defined on the base class.
        *StarAlignerBase.additional_inputs,
        ToolInput("help", Boolean(optional=True), prefix="--help", doc="help page"),
        ToolInput(
            "runThreadN",
            Int(optional=True),
            default=CpuSelector(),  # defaults to the CPUs allocated to the job
            prefix="--runThreadN",
            doc="int: number of threads to run STAR. Default: 1.",
        ),
        ToolInput(
            "genomeDir",
            Directory(optional=True),
            prefix="--genomeDir",
            doc="string: path to the directory where genome files are stored (for –runMode alignReads) or will be generated (for –runMode generateGenome). Default: ./GenomeDir",
        ),
        ToolInput(
            "readFilesIn",
            Array(FastqGz, optional=True),
            prefix="--readFilesIn",
            separator=",",  # read1,read2 joined with a comma on the command line
            doc="string(s): paths to files that contain input read1 (and, if needed, read2). Default: Read1,Read2.",
        ),
        ToolInput(
            "outFileNamePrefix",
            Filename(),
            prefix="--outFileNamePrefix",
            doc="string: output files name prefix (including full or relative path). Can only be defined on the command line.",
        ),
        ToolInput(
            "outSAMtype",
            Array(String(), optional=True),
            prefix="--outSAMtype",
            separator=" ",
            # single "--outSAMtype" prefix before the space-separated word list
            prefix_applies_to_all_elements=False,
            doc='strings: type of SAM/BAM output. 1st word: "BAM": outputBAMwithoutsorting, "SAM": outputSAMwithoutsorting, "None": no SAM/BAM output. 2nd,3rd: "Unsorted": standard unsorted. "SortedByCoordinate": sorted by coordinate. This option will allocate extra memory for sorting which can be specified by –limitBAMsortRAM.',
        ),
        ToolInput(
            "outSAMunmapped",
            String(optional=True),
            prefix="--outSAMunmapped",
            doc="string(s): output of unmapped reads in the SAM format",
        ),
        ToolInput(
            "outSAMattributes",
            String(optional=True),
            prefix="--outSAMattributes",
            doc="string: a string of desired SAM attributes, in the order desired for the output SAM",
        ),
        ToolInput(
            "readFilesCommand",
            String(optional=True),
            prefix="--readFilesCommand",
            doc="string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout",
        ),
    ]
def test_add_scatter_nested_arrays_incompatible(self):
    """Scattering Array(Array(int)) into 'inputs' should mark the edge incompatible."""
    wf = WorkflowBuilder("scatterededge")
    wf.input("inp", Array(Array(int)))
    scattered_step = wf.step("stp", ArrayTestTool(inputs=wf.inp), scatter="inputs")
    edge = first_value(wf.stp.sources["inputs"].source_map)
    self.assertFalse(edge.compatible_types)
    self.assertListEqual(["inputs"], scattered_step.scatter.fields)
def test_add_scatter_nested_arrays(self):
    """Scattering Array(Array(str)) into 'inps' should produce a compatible edge."""
    wf = WorkflowBuilder("scatterededge")
    wf.input("inp", Array(Array(str)))
    scattered_step = wf.step("stp", ArrayTestTool(inps=wf.inp), scatter="inps")
    edge = wf.stp.sources["inps"].source_map[0]
    self.assertTrue(edge.compatible_types)
    self.assertListEqual(["inps"], scattered_step.scatter.fields)
def test_array_of_array_of_strings(self):
    """A nested Array(Array(String)) should serialise to a nested CWL array schema."""
    nested = Array(Array(String()))
    rendered = nested.cwl_type()
    expected = {"type": "array", "items": {"type": "array", "items": "string"}}
    self.assertDictEqual(expected, rendered.save())
def outputs(self) -> List[ToolOutput]:
    """Collect the produced zip archives and the per-sample fastqc data files."""
    zipped_reports = ToolOutput(
        "out", Array(ZipFile()), glob=WildcardSelector(wildcard="*.zip")
    )
    data_files = ToolOutput(
        "datafile",
        Array(File),
        glob=WildcardSelector(wildcard="*/fastqc_data.txt"),
    )
    return [zipped_reports, data_files]
def inputs(self) -> List[ToolInput]:
    """Inputs for combining multiple germline/somatic VCFs into one output VCF."""
    return [
        ToolInput(
            "outputFilename",
            Filename(extension=".vcf", suffix=".combined"),
            prefix="-o",
        ),
        # deprecated
        # ToolInput(
        #     "regions",
        #     Filename(extension=".tsv"),
        #     prefix="--regions",
        #     doc="Region file containing all the variants, used as samtools mpileup",
        # ),
        ToolInput(
            "vcfs",
            Array(Vcf()),
            prefix="-i",
            prefix_applies_to_all_elements=True,  # "-i" is repeated before every vcf
            doc="input vcfs, the priority of the vcfs will be based on the order of the input",
        ),
        ToolInput("type", String(), prefix="--type", doc="germline | somatic"),
        ToolInput(
            "columns",
            Array(String(), optional=True),
            prefix="--columns",
            separator=",",
            doc="Columns to keep, seperated by space output vcf (unsorted)",
        ),
        ToolInput(
            "normal",
            String(optional=True),
            prefix="--normal",
            doc="Sample id of germline vcf, or normal sample id of somatic vcf",
        ),
        ToolInput(
            "tumor",
            String(optional=True),
            prefix="--tumor",
            doc="tumor sample ID, required if inputs are somatic vcfs",
        ),
        ToolInput(
            "priority",
            Int(optional=True),
            prefix="--priority",
            doc="The priority of the callers, must match with the callers in the source header",
        ),
    ]
def inputs(self) -> List[ToolInput]:
    """Header vcfs, content vcfs, and a shell-redirected output filename."""
    header_vcfs = ToolInput("headerVcfs", Array(VcfTabix), position=1)
    content_vcfs = ToolInput("contentVcfs", Array(VcfTabix), position=4)
    # The output is produced via shell redirection, hence the ">" prefix and
    # shell_quote=False so the redirection operator is not quoted away.
    redirected_output = ToolInput(
        "outputFilename",
        Filename(extension=".vcf", suffix=".strelka"),
        prefix=">",
        position=6,
        shell_quote=False,
    )
    return [header_vcfs, content_vcfs, redirected_output]
def test_array_of_array_of_strings(self):
    """A nested Array(Array(String)) should render as a nested CWL array dict."""
    nested = Array(Array(String()))
    rendered = nested.cwl_type()
    expected = {"type": "array", "items": {"type": "array", "items": "string"}}
    self.assertEqual(rendered.get_dict(), expected)
def constructor(self):
    """Build the alignment sub-workflow: cutadapt -> bwa mem post-alt -> sortsam.

    Trims adapters from the fastq pair, aligns against the alt-aware
    reference, then coordinate-sorts the result with an index.
    """
    # Inputs
    self.input("sample_name", str)
    self.input("reference", FastaWithDict)
    self.input("reference_alt", File)
    self.input("fastq", FastqGzPair)

    # Pipe adapters
    self.input("cutadapt_adapter", Array(str, optional=True))
    self.input("cutadapt_removeMiddle3Adapter", Array(str, optional=True))

    # Steps
    self.step(
        "cutadapt",
        CutAdapt_2_4(
            fastq=self.fastq,
            adapter=self.cutadapt_adapter,
            front=None,
            removeMiddle5Adapter=None,
            removeMiddle3Adapter=self.cutadapt_removeMiddle3Adapter,
            qualityCutoff=15,  # trim low-quality read ends below Q15
            minimumLength=50,  # drop reads shorter than 50bp after trimming
            outputPrefix=self.sample_name,
        ),
    )
    self.step(
        "bwamempostalt",
        BwaMem_PostAlt_SamToolsView(
            reads=self.cutadapt.out,
            sampleName=self.sample_name,
            reference=self.reference,
            markShorterSplits=True,
            reference_alt=self.reference_alt,
        ),
    )
    self.step(
        "sortsam",
        Gatk4SortSam_4_1_2(
            bam=self.bwamempostalt.out,
            sortOrder="coordinate",
            createIndex=True,
            validationStringency="SILENT",
            maxRecordsInRam=5000000,
            tmpDir=".",
        ),
    )

    # Outputs
    self.output("out", source=self.sortsam)
def outputs(self):
    """Demultiplexed fastq.gz files plus the Stats/ and InterOp/ directories."""
    unaligned = ToolOutput(
        "unalignedReads",
        output_type=Array(FastqGz()),
        glob=WildcardSelector("*/*.fastq.gz"),
    )
    run_stats = ToolOutput(
        "stats", output_type=Array(File()), glob=WildcardSelector("Stats/*")
    )
    interop_files = ToolOutput(
        "interop", output_type=Array(File()), glob=WildcardSelector("InterOp/*")
    )
    return [unaligned, run_stats, interop_files]
def add_inputs(self):
    """Register the workflow's top-level inputs.

    Declares the normal/tumour fastq pairs and sample names, then delegates
    to the reference, interval and configuration input helpers.
    """
    # INPUTS
    self.input("normal_inputs", Array(FastqGzPair), doc=INPUT_DOCS["normal_inputs"])
    self.input("tumor_inputs", Array(FastqGzPair), doc=INPUT_DOCS["tumor_inputs"])
    self.input("normal_name", String(), doc=INPUT_DOCS["normal_name"])
    self.input("tumor_name", String(), doc=INPUT_DOCS["tumor_name"])
    self.add_inputs_for_reference()
    self.add_inputs_for_intervals()
    self.add_inputs_for_configuration()
def inputs(self) -> List[ToolInput]:
    """Cutadapt inputs: the fastq pair, adapter sequences, and output filenames.

    Fix: removed dead code that imported ``uuid`` and generated an unused
    ``fastq_uuid`` string on every call — the value was never referenced.
    """
    return [
        ToolInput("fastq", FastqGzPair, position=5),
        ToolInput(
            "adapter",
            input_type=Array(String(), optional=True),
            prefix="-a",
            prefix_applies_to_all_elements=True,  # "-a" repeated per adapter
            doc="Sequence of an adapter ligated to the 3' end (paired data: of the first read). "
            "The adapter and subsequent bases are trimmed. If a '$' character is appended ('anchoring'), "
            "the adapter is only found if it is a suffix of the read.",
        ),
        ToolInput(
            "outputFilename",
            Filename(suffix="-R1", extension=".fastq.gz"),
            prefix="-o",
            doc="Write trimmed reads to FILE. FASTQ or FASTA format is chosen depending on input. "
            "The summary report is sent to standard output. Use '{name}' in FILE to demultiplex "
            "reads into multiple files. Default: write to standard output",
        ),
        ToolInput(
            "secondReadFile",
            Filename(suffix="-R2", extension=".fastq.gz"),
            prefix="-p",
            doc="Write second read in a pair to FILE.",
        ),
        *self.additional_args,
    ]
def inputs(self):
    """Inputs for merging multiple indexed BAMs into a single named BAM."""
    return [
        *super().inputs(),
        ToolInput(
            "bams",
            Array(BamBai()),
            prefix="-I",
            prefix_applies_to_all_elements=True,  # "-I" repeated for each bam
            doc="The SAM/BAM file to sort.",
            position=10,
        ),
        ToolInput(
            "sampleName",
            String(optional=True),
            doc="Used for naming purposes only",
        ),
        ToolInput(
            "outputFilename",
            Filename(
                # output name derives from sampleName, e.g. <sample>.merged.bam
                prefix=InputSelector("sampleName"),
                suffix=".merged",
                extension=".bam",
            ),
            position=10,
            prefix="-O",
            doc="SAM/BAM file to write merged result to",
        ),
        *self.additional_args,
    ]
def test_add_non_scatter2(self):
    """An array fed to an array input without scatter leaves the edge unscattered."""
    wf = WorkflowBuilder("scatterededge")
    wf.input("inp", Array(String()))
    wf.step("stp", ArrayTestTool(inputs=wf.inp))
    edge = first_value(wf.stp.sources["inputs"].source_map)
    self.assertFalse(edge.scatter)
def inputs(self):
    """Inputs for GATK GetPileupSummaries over one or more indexed BAMs."""
    return [
        *super().inputs(),
        *Gatk4GetPileUpSummariesBase.additional_args,
        ToolInput(
            "bam",
            Array(BamBai()),
            prefix="-I",
            prefix_applies_to_all_elements=True,  # "-I" repeated for each bam
            doc="The SAM/BAM/CRAM file containing reads.",
            position=0,
        ),
        ToolInput(
            "sites",
            VcfTabix(),
            prefix="-V",
            doc="sites of common biallelic variants",
        ),
        ToolInput(
            "intervals",
            VcfTabix(optional=True),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
        ),
        ToolInput(
            "pileupTableOut", Filename(extension=".txt"), position=1, prefix="-O"
        ),
    ]
def inputs(self):
    """Two positional file-array inputs plus an optional output directory."""
    first_file_set = ToolInput(
        "inp_files",
        Array(File),
        position=4,
    )
    second_file_set = ToolInput(
        "inp_files2",
        Array(File),
        position=5,
    )
    out_dir = ToolInput(
        "output_dir", String(optional=True), default="output_dir", position=8
    )
    return [first_file_set, second_file_set, out_dir]
def inputs(self):
    """Inputs for featureCounts: BAM list, output filename and annotation file."""
    return [
        *self.additional_inputs,
        ToolInput(
            "bam",
            Array(Bam),
            position=10,
            doc="A list of SAM or BAM format files. They can be either name or location sorted. If no files provided, <stdin> input is expected. Location-sorted paired-end reads are automatically sorted by read names.",
        ),
        ToolInput(
            "outputFilename",
            Filename(extension=".txt"),
            prefix="-o",
            doc="Name of output file including read counts. A separate file including summary statistics of counting results is also included in the output ('<string>.summary'). Both files are in tab delimited format.",
        ),
        ToolInput(
            "annotationFile",
            File,
            prefix="-a",
            doc="Name of an annotation file. GTF/GFF format by default. See -F option for more format information. Inbuilt annotations (SAF format) is available in 'annotation' directory of the package. Gzipped file is also accepted.",
        ),
    ]
def constructor(self):
    """Scatter HaplotypeCaller across intervals, then merge the per-interval VCFs."""
    self.input("input_bam", BamBai)
    self.input("ref_fasta", FastaWithIndexes)
    self.input("intervals", Array(Bed))

    self.step(
        "haplotype_caller",
        Gatk4HaplotypeCaller_4_1_4(
            inputRead=self.input_bam,
            reference=self.ref_fasta,
            intervals=self.intervals,
            gvcfGqBands=[10, 20, 30, 40, 50, 60, 70, 80, 90],
            contaminationFractionToFilter=0.0,
            annotationGroup=[
                "StandardAnnotation",
                "StandardHCAnnotation",
                # "AS_StandardAnnotation",
            ],
        ),
        scatter="intervals",  # one HaplotypeCaller job per interval file
    )
    self.step("merge", Gatk4MergeVcfs_4_1_4(vcfs=self.haplotype_caller.out))
    self.output("output_vcf", source=self.merge.output_vcf)
def inputs(self):
    """Inputs for samtools view: SAM/BAM/CRAM in, BAM out, optional regions."""
    return [
        *super(SamToolsViewBase, self).inputs(),
        *SamToolsViewBase.additional_inputs,
        ToolInput("sam", UnionType(Sam(), Bam(), Cram()), position=10),
        ToolInput(
            "reference",
            FastaWithDict(optional=True),
            position=6,
            prefix="-T",
            doc="A FASTA format reference FILE, optionally compressed by bgzip and ideally indexed "
            "by samtools faidx. If an index is not present, one will be generated for you.",
        ),
        ToolInput(
            "outputFilename",
            Filename(
                # name the output after the input file, minus its extension
                prefix=InputSelector("sam", remove_file_extension=True),
                extension=".bam",
            ),
            position=5,
            prefix="-o",
            doc="Output to FILE [stdout].",
        ),
        ToolInput(
            "regions",
            Array(String, optional=True),
            position=11,  # must follow the input filename on the command line
            doc="Region specifications after the input filename to restrict output to only those alignments which "
            "overlap the specified region(s). Use of region specifications requires a coordinate-sorted and "
            "indexed input file (in BAM or CRAM format)",
        ),
    ]
def inputs(self):
    """GRIDSS inputs: bams, reference, output/assembly names and run options."""
    input_bams = ToolInput("bams", Array(Bam()), position=10)
    reference = ToolInput(
        "reference", FastaWithDict(), position=1, prefix="--reference"
    )
    sv_output = ToolInput(
        "outputFilename",
        Filename(suffix=".svs", extension=".vcf"),
        position=2,
        prefix="--output",
    )
    assembly_output = ToolInput(
        "assemblyFilename",
        Filename(suffix=".assembled", extension=".bam"),
        position=3,
        prefix="--assembly",
    )
    thread_count = ToolInput(
        "threads", Int(optional=True), default=CpuSelector(), prefix="--threads"
    )
    blacklist_bed = ToolInput(
        "blacklist", Bed(optional=True), position=4, prefix="--blacklist"
    )
    working_dir = ToolInput(
        "tmpdir", String(optional=True), default="./TMP", prefix="--workingdir"
    )
    return [
        input_bams,
        reference,
        sv_output,
        assembly_output,
        thread_count,
        blacklist_bed,
        working_dir,
    ]
def tests(self):
    """Integration test cases that run the tool against remote BRCA1 fastq data."""
    remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"
    return [
        TTestCase(
            name="basic",
            input={
                "reads": [
                    f"{remote_dir}/NA12878-BRCA1_R1.fastq.gz",
                    f"{remote_dir}/NA12878-BRCA1_R2.fastq.gz",
                ],
                "threads": 1,
            },
            # expected pair/read sizes, plus the wrapped datafile expectation
            output=FastqGzPair.basic_test("out", 824000, 408000, 416000)
            + Array.array_wrapper(
                [
                    TextFile.basic_test(
                        "datafile",
                        81000,
                    )
                ]
            ),
        ),
        TTestCase(
            name="minimal",
            input={
                "reads": [
                    f"{remote_dir}/NA12878-BRCA1_R1.fastq.gz",
                    f"{remote_dir}/NA12878-BRCA1_R2.fastq.gz",
                ],
                "threads": 1,
            },
            output=self.minimal_test(),
        ),
    ]
def inputs(self):
    """Inputs for GATK DepthOfCoverage: bam, reference, output prefix, intervals."""
    return [
        *super(Gatk4DepthOfCoverageBase, self).inputs(),
        ToolInput(
            "bam",
            BamBai(),
            prefix="-I",
            doc="The SAM/BAM/CRAM file containing reads.",
            # index is staged as <name>.bai rather than <name>.bam.bai
            secondaries_present_as={".bai": "^.bai"},
        ),
        ToolInput(
            "reference", FastaWithDict(), prefix="-R", doc="Reference sequence"
        ),
        ToolInput(
            "outputPrefix",
            String(),
            prefix="-O",
            doc="An output file created by the walker. Will overwrite contents if file exists",
        ),
        ToolInput(
            "intervals",
            Array(Bed),
            prefix="--intervals",
            doc="-L (BASE) One or more genomic intervals over which to operate",
            prefix_applies_to_all_elements=True,  # "--intervals" repeated per bed
        ),
        *self.additional_args,
    ]
def constructor(self):
    """Scatter single-sample FastQC over every input fastq, re-exposing options."""
    self.input("reads", Array(FastqGz))

    # Options of the single-sample FastQC tool that this wrapper forwards.
    forwarded_options = {
        "casva",
        "nano",
        "nofilter",
        "noextract",
        "nogroup",
        "format",
        "contaminants",
        "adapters",
        "limits",
        "kmers",
    }

    tool_inputs: Dict[str, ToolInput] = {
        i.id(): i for i in fastqc_single_instantiated.inputs()
    }

    # Re-declare each forwarded option as a workflow input, keeping its doc.
    kwargs = {}
    for tin in tool_inputs.values():
        if tin.id() in forwarded_options:
            kwargs[tin.id()] = self.input(tin.id(), tin.input_type, doc=tin.doc)

    self.step(
        "fastqc",
        FastQCSingleLatest(read=self.reads, **kwargs),
        scatter="read",
    )
    self.capture_outputs_from_step(self.fastqc)
def inputs(self):
    """Inputs for merging aligned BAM(s) back with the original unmapped BAM."""
    return [
        ToolInput(
            "ubam",
            BamBai(),
            prefix="--UNMAPPED_BAM",
            prefix_applies_to_all_elements=True,
            doc="Original SAM or BAM file of unmapped reads, which must be in queryname order.",
            position=10,
        ),
        ToolInput(
            "bam",
            Array(BamBai()),
            prefix="--ALIGNED_BAM",
            prefix_applies_to_all_elements=True,  # prefix repeated per bam
            doc="SAM or BAM file(s) with alignment data.",
            position=10,
        ),
        ToolInput(
            "reference",
            FastaWithDict(optional=True),
            prefix="--REFERENCE_SEQUENCE",
            position=10,
            doc="Reference sequence file.",
        ),
        ToolInput(
            "outputFilename",
            Filename(extension=".bam"),
            position=10,
            prefix="--OUTPUT",
            doc="Merged SAM or BAM file to write to.",
        ),
        *self.additional_args,
    ]
def inputs(self):
    """Inputs for MarkDuplicates; output names derive from an optional prefix."""
    # Would be good to include this in the prefix:
    # If(InputSelector("bam").length().equals(1), InputSelector("bam")[0].basename(), None)
    # Falls back to the literal "generated" when outputPrefix is not provided.
    prefix = FirstOperator([InputSelector("outputPrefix"), "generated"])
    return [
        ToolInput(
            "bam",
            Array(Bam),
            prefix="-I",
            position=10,
            # secondaries_present_as={".bai": "^.bai"},
            doc="One or more input SAM or BAM files to analyze. Must be coordinate sorted.",
        ),
        ToolInput("outputPrefix", String(optional=True)),
        ToolInput(
            "outputFilename",
            Filename(prefix=prefix, suffix=".markduped", extension=".bam"),
            position=10,
            prefix="-O",
            doc="File to write duplication metrics to",
        ),
        ToolInput(
            "metricsFilename",
            Filename(prefix=prefix, suffix=".metrics", extension=".txt"),
            position=10,
            prefix="-M",
            doc="The output file to write marked records to.",
        ),
        *super().inputs(),
        *self.additional_args,
    ]
def constructor(self):
    """Merge multiple indexed BAMs, then mark duplicates on the merged BAM."""
    self.input("bams", Array(BamBai()))
    self.input("createIndex", Boolean, default=True)
    self.input("maxRecordsInRam", Int, default=5000000)
    self.input("sampleName", String(optional=True))

    self.step(
        "mergeSamFiles",
        Gatk4MergeSamFiles_4_1_2(
            bams=self.bams,
            useThreading=True,
            createIndex=self.createIndex,
            maxRecordsInRam=self.maxRecordsInRam,
            validationStringency="SILENT",
            sampleName=self.sampleName,
        ),
    )
    self.step(
        "markDuplicates",
        Gatk4MarkDuplicates_4_1_2(
            bam=self.mergeSamFiles.out,
            createIndex=self.createIndex,
            maxRecordsInRam=self.maxRecordsInRam,
        ),
    )
    self.output("out", source=self.markDuplicates.out)
def inputs(self):
    """Inputs for GATK LearnReadOrientationModel from f1r2 count archives."""
    return [
        *super(Gatk4LearnReadOrientationModelBase, self).inputs(),
        *Gatk4LearnReadOrientationModelBase.additional_args,
        ToolInput(
            "f1r2CountsFiles",
            Array(TarFileGz),
            position=0,
            prefix="-I",
            prefix_applies_to_all_elements=True,  # "-I" repeated per archive
            doc="Counts for the read orientation of fragments",
        ),
        ToolInput(
            "numEmIterations",
            Int(optional=True),
            position=1,
            prefix="--num-em-iterations",
            default=30,  # Sebastian thinks this is best
            doc="Amount of iterations for the em process before it bails",
        ),
        ToolInput(
            "modelFileOut", Filename(extension=".tar.gz"), position=3, prefix="-O"
        ),
    ]
def inputs(self):
    """Inputs for MarkDuplicates with fixed 'generated' output naming."""
    return [
        ToolInput(
            "bam",
            Array(Bam),
            prefix="-I",
            position=10,
            # secondaries_present_as={".bai": "^.bai"},
            doc="One or more input SAM or BAM files to analyze. Must be coordinate sorted.",
        ),
        ToolInput(
            "outputFilename",
            Filename(
                prefix="generated",
                suffix=".markduped",
                extension=".bam",
            ),
            position=10,
            prefix="-O",
            doc="File to write duplication metrics to",
        ),
        ToolInput(
            "metricsFilename",
            Filename(extension=".metrics.txt"),
            position=10,
            prefix="-M",
            doc="The output file to write marked records to.",
        ),
        *super().inputs(),
        *self.additional_args,
    ]