Ejemplo n.º 1
0
 def inputs(self):
     """Return the ToolInput definitions for this tool.

     The list holds one positional, localised compressed VCF followed by
     optional flags and values. The short option each one mirrors is
     recorded in its ``doc`` string.
     """
     vcf = ToolInput("vcf", CompressedVcf, position=1, localise_file=True)
     csi = ToolInput(
         tag="csi",
         input_type=Boolean(optional=True),
         prefix="--csi",
         doc="(-c) generate CSI-format index for VCF/BCF files [default]",
     )
     force = ToolInput(
         tag="force",
         input_type=Boolean(optional=True),
         prefix="--force",
         doc="(-f) overwrite index if it already exists",
     )
     min_shift = ToolInput(
         tag="minShift",
         input_type=Int(optional=True),
         prefix="--min-shift",
         doc="(-m) set minimal interval size for CSI indices to 2^INT [14]",
     )
     # NOTE: an "outputFilename" input (--output-file) is intentionally not
     # part of the returned list.
     tbi = ToolInput(
         tag="tbi",
         input_type=Boolean(optional=True),
         default=True,
         prefix="--tbi",
         doc="(-t) generate TBI-format index for VCF files",
     )
     threads = ToolInput(
         tag="threads",
         input_type=Int(optional=True),
         default=CpuSelector(),
         prefix="--threads",
         doc="sets the number of threads [0]",
     )
     nrecords = ToolInput(
         tag="nrecords",
         input_type=Boolean(optional=True),
         prefix="--nrecords",
         doc="(-n) print number of records based on existing index file",
     )
     stats = ToolInput(
         tag="stats",
         input_type=Boolean(optional=True),
         prefix="--stats",
         doc="(-s) print per contig stats based on existing index file",
     )
     return [vcf, csi, force, min_shift, tbi, threads, nrecords, stats]
    def constructor(self):
        """Assemble the workflow: inputs, steps and outputs.

        Steps are registered in order because later steps read the outputs
        of earlier ones through ``self.<step_name>``:

        * ``vardict`` -> ``annotate`` (adds the supplied header lines)
        * ``annotate`` -> ``compressvcf`` -> ``tabixvcf`` (output ``variants``)
        * ``annotate`` -> ``splitnormalisevcf`` -> ``trim`` -> ``filterpass``
          (output ``out``)
        """

        self.input("bam", BamBai)
        self.input("intervals", Bed)
        self.input("sample_name", String)
        self.input("header_lines", File)
        self.input("reference", FastaWithDict)

        # vardict options
        self.input("allele_freq_threshold", Float, default=0.05)
        self.input("min_mapping_qual", Int(optional=True))
        self.input("filter", String(optional=True))
        self.input("no_sv_call", Boolean(optional=True))

        # The same sample name and allele-frequency threshold inputs are
        # wired to both the plain and the "var2vcf"-prefixed parameters.
        self.step(
            "vardict",
            VarDictGermline_1_6_0(
                intervals=self.intervals,
                bam=self.bam,
                reference=self.reference,
                sampleName=self.sample_name,
                var2vcfSampleName=self.sample_name,
                alleleFreqThreshold=self.allele_freq_threshold,
                var2vcfAlleleFreqThreshold=self.allele_freq_threshold,
                vcfFormat=True,
                chromColumn=1,
                regStartCol=2,
                geneEndCol=3,
                threads=4,
                minMappingQual=self.min_mapping_qual,
                filter=self.filter,
                noStructuralVariants=self.no_sv_call,
            ),
        )
        self.step(
            "annotate",
            BcfToolsAnnotate_1_5(vcf=self.vardict.out,
                                 headerLines=self.header_lines),
        )
        # Branch 1: compress and index the annotated VCF.
        self.step("compressvcf",
                  BGZipLatest(file=self.annotate.out, stdout=True))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf.out))

        # Branch 2: split multi-allelic records, trim, then filter.
        self.step(
            "splitnormalisevcf",
            SplitMultiAllele(vcf=self.annotate.out, reference=self.reference),
        )
        self.step("trim", TrimIUPAC_0_0_5(vcf=self.splitnormalisevcf.out))
        self.step(
            "filterpass",
            VcfToolsvcftoolsLatest(
                vcf=self.trim.out,
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        self.output("variants", source=self.tabixvcf.out)
        self.output("out", source=self.filterpass.out)
Ejemplo n.º 3
0
 def inputs(self):
     """Return inherited inputs, the shared additional args, and this tool's own inputs."""
     inherited = super(Gatk4LearnReadOrientationModelBase, self).inputs()
     shared_args = Gatk4LearnReadOrientationModelBase.additional_args

     counts_files = ToolInput(
         "f1r2CountsFiles",
         Array(TarFileGz),
         position=0,
         prefix="-I",
         prefix_applies_to_all_elements=True,
         doc="Counts for the read orientation of fragments",
     )
     em_iterations = ToolInput(
         "numEmIterations",
         Int(optional=True),
         position=1,
         prefix="--num-em-iterations",
         default=30,  # Sebastian thinks this is best
         doc="Amount of iterations for the em process before it bails",
     )
     model_out = ToolInput(
         "modelFileOut",
         Filename(extension=".tar.gz"),
         position=3,
         prefix="-O",
     )
     return [*inherited, *shared_args, counts_files, em_iterations, model_out]
Ejemplo n.º 4
0
    def inputs(self):
        return [
            ToolInput("reference", Fasta, position=1, localise_file=True),
            # ToolInput(
            #     "prefix",
            #     String(optional=True),
            #     prefix="-p",
            #     doc="prefix of the index [same as fasta name]",
            # ),
            ToolInput(
                "blockSize",
                Int(optional=True),
                prefix="-b",
                doc="block size for the bwtsw algorithm (effective with -a bwtsw) [10000000]",
            ),
            ToolInput(
                "algorithm",
                String(optional=True),
                prefix="-a",
                doc="""\
BWT construction algorithm: bwtsw, is or rb2 [auto]
    - is	IS linear-time algorithm for constructing suffix array. It requires 5.37N memory where N is the size of the database. IS is moderately fast, but does not work with database larger than 2GB. IS is the default algorithm due to its simplicity. The current codes for IS algorithm are reimplemented by Yuta Mori.
    - bwtsw	Algorithm implemented in BWT-SW. This method works with the whole human genome.
""",
            ),
        ]
Ejemplo n.º 5
0
 def inputs(self):
     """Return the ToolInput definitions, ordered by their ``position`` (1 to 3)."""
     kmer_size = ToolInput(
         "kmer_size",
         Int(optional=True),
         prefix="-k",
         position=1,
         doc="k-mer (odd) length (default: 31, max value: 31)",
     )
     index = ToolInput(
         "index",
         Filename(extension=".kidx"),
         prefix="-i",
         position=2,
         doc="Filename for the kallisto index to be constructed",
     )
     reference = ToolInput(
         "reference",
         Fasta,
         position=3,
         localise_file=True,
         doc="Filename for a reference transcriptome",
     )
     # Not exposed here: --make-unique (replace repeated target names with
     # unique names).
     return [kmer_size, index, reference]
Ejemplo n.º 6
0
 def inputs(self):
     """Return the ToolInput definitions: input BAMs, reference, output names and options."""
     bams = ToolInput("bams", Array(Bam()), position=10)
     reference = ToolInput(
         "reference",
         FastaWithDict(),
         position=1,
         prefix="--reference",
     )
     output_vcf = ToolInput(
         "outputFilename",
         Filename(suffix=".svs", extension=".vcf"),
         position=2,
         prefix="--output",
     )
     assembly_bam = ToolInput(
         "assemblyFilename",
         Filename(suffix=".assembled", extension=".bam"),
         position=3,
         prefix="--assembly",
     )
     threads = ToolInput(
         "threads",
         Int(optional=True),
         default=CpuSelector(),
         prefix="--threads",
     )
     blacklist = ToolInput(
         "blacklist",
         Bed(optional=True),
         position=4,
         prefix="--blacklist",
     )
     working_dir = ToolInput(
         "tmpdir",
         String(optional=True),
         default="./TMP",
         prefix="--workingdir",
     )
     return [
         bams,
         reference,
         output_vcf,
         assembly_bam,
         threads,
         blacklist,
         working_dir,
     ]
    def add_vardict_variantcaller(self, bam_source):
        """Register the VarDict inputs, steps and outputs on this workflow.

        Args:
            bam_source: the source connected to the ``bam`` input of the
                ``vc_vardict`` step.

        Besides the inputs declared here, this method reads
        ``self.reference``, ``self.vardict_intervals`` and
        ``self.sample_name``, which must already exist on the workflow.
        Steps are registered in order because each later step consumes the
        output of an earlier one via ``self.<step_name>``.
        """
        # Declared as a plain statement: the original had a trailing comma
        # here, which wrapped the call in a discarded one-element tuple.
        self.input("allele_freq_threshold", Float, default=0.05)
        self.input("minMappingQual", Int(optional=True))
        self.input("filter", String(optional=True))

        # Vardict
        self.step(
            "generate_vardict_headerlines",
            GenerateVardictHeaderLines(reference=self.reference),
        )
        self.step(
            "vc_vardict",
            VardictGermlineVariantCaller(
                bam=bam_source,
                reference=self.reference,
                intervals=self.vardict_intervals,
                sample_name=self.sample_name,
                allele_freq_threshold=self.allele_freq_threshold,
                header_lines=self.generate_vardict_headerlines.out,
                minMappingQual=self.minMappingQual,
                filter=self.filter,
            ),
            scatter="intervals",
        )

        # Gather the scattered per-interval VCFs, then compress and sort.
        self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
        self.step(
            "vc_vardict_compress_for_sort",
            BGZipLatest(file=self.vc_vardict_merge.out.as_type(Vcf)),
        )
        self.step(
            "vc_vardict_sort_combined",
            BcfToolsSort_1_9(
                vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
            ),
        )

        # Not consumed within this method; registered for use elsewhere.
        self.step(
            "vc_vardict_uncompress_for_combine",
            UncompressArchive(file=self.vc_vardict_sort_combined.out),
        )

        self.output(
            "out_variants_vardict",
            source=self.vc_vardict_sort_combined.out,
            output_folder=["variants"],
            output_name="vardict",
            doc="Merged variants from the VarDict caller",
        )
        self.output(
            "out_variants_vardict_split",
            source=self.vc_vardict.out,
            output_folder=["variants", "vardict"],
            doc="Unmerged variants from the VarDict caller (by interval)",
        )
 def inputs(self):
     """Return six purely positional ToolInputs (positions 1 to 6)."""
     tumor_name = ToolInput("tumor_name", String, position=1)
     normal_name = ToolInput("normal_name", String, position=2)
     facets_file = ToolInput("facets_file", File, position=3)
     sv_file = ToolInput("sv_file", CompressedVcf, position=4)
     output_dir = ToolInput(
         "output_dir", String(optional=True), default=".", position=5
     )
     manta_filter = ToolInput("manta_filter", Int(optional=True), position=6)
     return [
         tumor_name,
         normal_name,
         facets_file,
         sv_file,
         output_dir,
         manta_filter,
     ]
Ejemplo n.º 9
0
    def test_tool_input_value_default_cpuselect(self):
        """A CpuSelector default is rendered as a nested ``select_first`` in WDL."""
        # The unused ``tid`` lookup dict was removed; the translate call
        # below only ever received the ToolInput itself.
        ti = ToolInput("threads", Int(), default=CpuSelector(), prefix="-t")

        tr = wdl.translate_command_input(ti)
        self.assertEqual(
            "-t ~{select_first([threads, select_first([runtime_cpu, 1])])}",
            tr.get_string(),
        )
Ejemplo n.º 10
0
    def constructor(self):
        """Declare the workflow inputs, then delegate step creation to helpers."""
        required_inputs = (
            ("normal_bam", BamBai),
            ("tumor_bam", BamBai),
            ("normal_name", String),
            ("tumor_name", String),
            ("snps_dbsnp", File),
        )
        for identifier, datatype in required_inputs:
            self.input(identifier, datatype)

        # optional
        optional_inputs = (
            ("pseudo_snps", Int),
            ("max_depth", Int),
            ("everything", Boolean),
            ("genome", String),
            ("cval", Int),
            ("purity_cval", Int),
            ("normal_depth", Int),
        )
        for identifier, datatype in optional_inputs:
            self.input(identifier, datatype(optional=True))

        self.add_snp_pileup()
        self.add_run_facets()
Ejemplo n.º 11
0
 def inputs(self):
     """Return the shared additional inputs followed by this tool's own ToolInputs."""
     shared = StarAlignerBase.additional_inputs

     show_help = ToolInput(
         "help", Boolean(optional=True), prefix="--help", doc="help page"
     )
     run_threads = ToolInput(
         "runThreadN",
         Int(optional=True),
         default=CpuSelector(),
         prefix="--runThreadN",
         doc="int: number of threads to run STAR. Default: 1.",
     )
     genome_dir = ToolInput(
         "genomeDir",
         Directory(optional=True),
         prefix="--genomeDir",
         doc="string: path to the directory where genome files are stored (for –runMode alignReads) or will be generated (for –runMode generateGenome). Default: ./GenomeDir",
     )
     read_files = ToolInput(
         "readFilesIn",
         Array(FastqGz, optional=True),
         prefix="--readFilesIn",
         separator=",",
         doc="string(s): paths to files that contain input read1 (and, if needed, read2). Default: Read1,Read2.",
     )
     out_prefix = ToolInput(
         "outFileNamePrefix",
         Filename(),
         prefix="--outFileNamePrefix",
         doc="string: output files name prefix (including full or relative path). Can only be defined on the command line.",
     )
     # Elements are joined with a space behind a single prefix.
     out_sam_type = ToolInput(
         "outSAMtype",
         Array(String(), optional=True),
         prefix="--outSAMtype",
         separator=" ",
         prefix_applies_to_all_elements=False,
         doc='strings: type of SAM/BAM output. 1st word: "BAM": outputBAMwithoutsorting, "SAM": outputSAMwithoutsorting, "None": no SAM/BAM output. 2nd,3rd: "Unsorted": standard unsorted. "SortedByCoordinate": sorted by coordinate. This option will allocate extra memory for sorting which can be specified by –limitBAMsortRAM.',
     )
     out_sam_unmapped = ToolInput(
         "outSAMunmapped",
         String(optional=True),
         prefix="--outSAMunmapped",
         doc="string(s): output of unmapped reads in the SAM format",
     )
     out_sam_attributes = ToolInput(
         "outSAMattributes",
         String(optional=True),
         prefix="--outSAMattributes",
         doc="string: a string of desired SAM attributes, in the order desired for the output SAM",
     )
     read_files_command = ToolInput(
         "readFilesCommand",
         String(optional=True),
         prefix="--readFilesCommand",
         doc="string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout",
     )
     return [
         *shared,
         show_help,
         run_threads,
         genome_dir,
         read_files,
         out_prefix,
         out_sam_type,
         out_sam_unmapped,
         out_sam_attributes,
         read_files_command,
     ]
Ejemplo n.º 12
0
    def test_tool_input_value_default_cpuselect_nodefault(self):
        """``CpuSelector(None)`` falls back to ``runtime_cpu`` when the input is undefined."""
        threads_input = ToolInput(
            "threads", Int(), default=CpuSelector(None), prefix="-t"
        )
        inputs_by_tag = {"threads": threads_input}

        translated = wdl.translate_command_input(threads_input, inputs_by_tag)
        expected = '${"-t " + if defined(threads) then threads else runtime_cpu}'
        self.assertEqual(expected, translated.get_string())
Ejemplo n.º 13
0
 def inputs(self) -> List[ToolInput]:
     """Return the ToolInput definitions: output name, input VCFs, type and optional settings."""
     output_filename = ToolInput(
         "outputFilename",
         Filename(extension=".vcf", suffix=".combined"),
         prefix="-o",
     )
     # NOTE: the deprecated "regions" input (--regions) is intentionally
     # not part of the returned list.
     vcfs = ToolInput(
         "vcfs",
         Array(Vcf()),
         prefix="-i",
         prefix_applies_to_all_elements=True,
         doc="input vcfs, the priority of the vcfs will be based on the order of the input",
     )
     variant_type = ToolInput(
         "type",
         String(),
         prefix="--type",
         doc="germline | somatic",
     )
     columns = ToolInput(
         "columns",
         Array(String(), optional=True),
         prefix="--columns",
         separator=",",
         doc="Columns to keep, seperated by space output vcf (unsorted)",
     )
     normal = ToolInput(
         "normal",
         String(optional=True),
         prefix="--normal",
         doc="Sample id of germline vcf, or normal sample id of somatic vcf",
     )
     tumor = ToolInput(
         "tumor",
         String(optional=True),
         prefix="--tumor",
         doc="tumor sample ID, required if inputs are somatic vcfs",
     )
     priority = ToolInput(
         "priority",
         Int(optional=True),
         prefix="--priority",
         doc="The priority of the callers, must match with the callers in the source header",
     )
     return [
         output_filename,
         vcfs,
         variant_type,
         columns,
         normal,
         tumor,
         priority,
     ]
Ejemplo n.º 14
0
 def test_input_value_cpuselect_stringenv(self):
     """In a string environment a CpuSelector renders as ``~{select_first([...])}``."""
     # CpuSelector refers to a ``runtime_cpu`` input, so the lookup dict
     # handed to the translator provides one.
     tool_inputs = {"runtime_cpu": ToolInput("runtime_cpu", Int(), default=1)}
     selector = CpuSelector()
     rendered = wdl.get_input_value_from_potential_selector_or_generator(
         selector, tool_inputs, string_environment=True
     )
     self.assertEqual("~{select_first([runtime_cpu, 1])}", rendered)
Ejemplo n.º 15
0
 def inputs(self):
     """Return the ToolInputs: the BAM (position 10) and an optional thread count (position 5)."""
     bam = ToolInput("bam", Bam(), position=10)
     threads = ToolInput(
         "threads",
         Int(optional=True),
         position=5,
         prefix="-@",
         doc="Number of BAM compression threads to use in addition to main thread [0].",
     )
     return [bam, threads]
Ejemplo n.º 16
0
 def inputs(self):
     """Return the two base ToolInputs: ``javaOptions`` and ``compression_level``."""
     java_options = ToolInput("javaOptions", Array(String, optional=True))
     compression_level = ToolInput(
         "compression_level",
         Int(optional=True),
         doc="Compression level for all compressed files created (e.g. BAM and VCF). Default value: 2.",
     )
     # NOTE: a "pg-tag" input (--add-output-sam-program-record) is
     # intentionally not part of the returned list.
     return [java_options, compression_level]
    def constructor(self):
        """Assemble the workflow: inputs, steps and outputs.

        Steps are registered in order because later steps read the outputs
        of earlier ones through ``self.<step_name>``:

        * ``vardict`` -> ``annotate`` (adds the supplied header lines)
        * ``annotate`` -> ``compressvcf`` -> ``tabixvcf`` (output ``variants``)
        * ``annotate`` -> ``splitnormalisevcf`` -> ``trim`` -> ``filterpass``
          (output ``out``)
        """

        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)
        self.input("normal_name", String)
        self.input("tumor_name", String)
        self.input("intervals", Bed)
        self.input("header_lines", File)
        self.input("reference", FastaWithDict)

        # vardict options
        # NOTE(review): 0.05 is passed positionally here; elsewhere in this
        # file the same value is passed as ``default=`` - presumably the
        # third positional parameter is the default, confirm.
        self.input("allele_freq_threshold", Float(), 0.05)
        self.input("minMappingQual", Int(optional=True))
        self.input("filter", String(optional=True))

        self.step(
            "vardict",
            VarDictSomatic_1_6_0(
                normalBam=self.normal_bam,
                tumorBam=self.tumor_bam,
                intervals=self.intervals,
                reference=self.reference,
                normalName=self.normal_name,
                tumorName=self.tumor_name,
                alleleFreqThreshold=self.allele_freq_threshold,
                vcfFormat=True,
                chromColumn=1,
                regStartCol=2,
                geneEndCol=3,
                threads=4,
                minMappingQual=self.minMappingQual,
                filter=self.filter,
            ),
        )
        self.step(
            "annotate",
            BcfToolsAnnotate_1_5(vcf=self.vardict.out,
                                 headerLines=self.header_lines),
        )
        # Branch 1: compress and index the annotated VCF.
        self.step("compressvcf",
                  BGZipLatest(file=self.annotate.out, stdout=True))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf.out))

        # Branch 2: split multi-allelic records, trim, then filter.
        self.step(
            "splitnormalisevcf",
            SplitMultiAllele(vcf=self.annotate.out, reference=self.reference),
        )
        self.step("trim", TrimIUPAC_0_0_5(vcf=self.splitnormalisevcf.out))
        self.step("filterpass", FilterVardictSomaticVcf(vcf=self.trim.out))

        self.output("variants", source=self.tabixvcf.out)
        self.output("out", source=self.filterpass.out)
Ejemplo n.º 18
0
 def inputs(self):
     """Return inherited inputs, shared additional inputs, and this tool's own inputs.

     The ``-@`` thread option is placed at position 5, ahead of the
     positional BAM at position 10. Previously both shared position 10,
     which left the relative order of the option and the operand up to
     tie-breaking; options are now always emitted before the BAM.
     """
     return [
         *super(SamToolsIndexBase, self).inputs(),
         *SamToolsIndexBase.additional_inputs,
         ToolInput("bam", Bam, position=10, localise_file=True),
         ToolInput(
             "threads",
             Int(optional=True),
             prefix="-@",
             default=CpuSelector(),
             position=5,
         ),
     ]
Ejemplo n.º 19
0
 def inputs(self):
     """Return this tool's ToolInputs followed by the shared additional inputs."""
     run_folder = ToolInput(
         "runFolderDir",
         input_type=Directory(),
         prefix="-R",
         doc="path to runfolder directory",
     )
     sample_sheet = ToolInput(
         "sampleSheet",
         input_type=Csv(),
         prefix="--sample-sheet",
         doc="path to the sample sheet",
     )
     # The three thread-count inputs all default to 4.
     loading_threads = ToolInput(
         "loadingThreads",
         input_type=Int(),
         prefix="-r",
         default=4,
         doc="number of threads used for loading BCL data",
     )
     processing_threads = ToolInput(
         "processingThreads",
         input_type=Int(),
         prefix="-p",
         default=4,
         doc="number of threads used for processing demultiplexed data",
     )
     writing_threads = ToolInput(
         "writingThreads",
         input_type=Int(),
         prefix="-w",
         default=4,
         doc="number of threads used for writing FASTQ data",
     )
     return [
         run_folder,
         sample_sheet,
         loading_threads,
         processing_threads,
         writing_threads,
         *Bcl2FastqBase.additional_inputs,
     ]
Ejemplo n.º 20
0
 def inputs(self):
     """Return the ToolInput definitions for the random-sampling tool.

     ``rate`` is bound to ``-r``: the upstream vcfrandomsample usage
     documents the rate option as ``-r, --rate`` (the previous ``-t``
     prefix is not a documented option), alongside ``-s, --scale-by``
     and ``-p, --random-seed``.
     """
     return [
         ToolInput("vcf", CompressedVcf, position=3),
         ToolInput("rate",
                   Float,
                   prefix="-r",
                   doc="base sampling probability per locus"),
         ToolInput(
             "scaleBy",
             String(optional=True),
             prefix="-s",
             doc="scale sampling likelihood by this Float info field",
         ),
         ToolInput("seed", Int(), prefix="-p", doc="use this random seed"),
     ]
Ejemplo n.º 21
0
 def inputs(self):
     """Return the ToolInput definitions: list/sample sources, two output files and options."""
     list_file = ToolInput(
         "listFile",
         File(optional=True),
         prefix="--list",
         doc="List file: A tsv file contains SampleName\tPathToBedtoolsOutput on each line",
     )
     sample_name = ToolInput(
         "sampleName",
         String(optional=True),
         prefix="--name",
         doc="Sample name if list not used",
     )
     bedtools_output = ToolInput(
         "bedtoolsOutputPath",
         File(optional=True),
         prefix="--path",
         doc="Path to bedtools output if list not used",
     )
     gene_file = ToolInput(
         "outputGeneFile",
         Filename(extension=".txt", suffix=".gene"),
         prefix="--gene",
         doc="Output gene file",
     )
     region_file = ToolInput(
         "outputRegionFile",
         Filename(extension=".txt", suffix=".region"),
         prefix="--region",
         doc="Output region file",
     )
     fold = ToolInput(
         "fold",
         String(optional=True),
         prefix="--fold",
         doc="Folds, quoted and commna sepparated, default 1,10,20,100",
     )
     threads = ToolInput(
         "threads",
         Int(optional=True),
         prefix="--threads",
         doc="number of threads, default:32",
     )
     return [
         list_file,
         sample_name,
         bedtools_output,
         gene_file,
         region_file,
         fold,
         threads,
     ]
Ejemplo n.º 22
0
 def inputs(self):
     """Return the ToolInputs: the VCF (position 3) plus two optional sort settings."""
     vcf = ToolInput("vcf", Vcf, position=3)
     in_memory = ToolInput(
         "inMemoryFlag",
         Boolean(optional=True),
         prefix="-a",
         default=False,
         doc="load all sites and then sort in memory",
     )
     window_size = ToolInput(
         "windowSize",
         Int(optional=True),
         prefix="-w",
         doc="number of sites to sort (default 10000)",
     )
     return [vcf, in_memory, window_size]
Ejemplo n.º 23
0
 def inputs(self):
     """Return the ToolInputs: query VCF, truth VCF, window size and reference."""
     vcf = ToolInput("vcf", CompressedVcf, position=3)
     truth = ToolInput(
         "truth",
         CompressedVcf(),
         prefix="-t",
         doc="use this VCF as ground truth for ROC generation",
     )
     window_size = ToolInput(
         "windowSize",
         Int(optional=True),
         prefix="-w",
         default=30,
         doc="compare records up to this many bp away (default 30)",
     )
     reference = ToolInput(
         "reference",
         FastaWithDict,
         prefix="-r",
         doc="FASTA reference file",
     )
     return [vcf, truth, window_size, reference]
Ejemplo n.º 24
0
 def inputs(self):
     """Return the ToolInputs: the VCF (position 3) plus optional decomposition settings."""
     vcf = ToolInput("vcf", CompressedVcf, position=3)
     use_mnps = ToolInput(
         "useMnpsFlag",
         Boolean(optional=True),
         prefix="-m",
         default=False,
         doc="Retain MNPs as separate events (default: false)",
     )
     tag_parsed = ToolInput(
         "tagParsed",
         String(optional=True),
         prefix="-t",
         doc="Tag records which are split apart of a complex allele with this flag",
     )
     keep_info = ToolInput(
         "keepInfoFlag",
         Boolean(optional=True),
         prefix="-k",
         doc="Maintain site and allele-level annotations when decomposing. Note that in many cases, such as multisample VCFs, these won't be valid post-decomposition.  For biallelic loci in single-sample VCFs, they should be usable with caution.",
     )
     keep_geno = ToolInput(
         "keepGenoFlag",
         Boolean(optional=True),
         prefix="-g",
         doc="Maintain genotype-level annotations when decomposing.  Similar caution should be used for this as for --keep-info.",
     )
     max_length = ToolInput(
         "maxLength",
         Int(optional=True),
         prefix="-L",
         doc="Do not manipulate records in which either the ALT or REF is longer than LEN (default: 200).",
     )
     return [vcf, use_mnps, tag_parsed, keep_info, keep_geno, max_length]
Ejemplo n.º 25
0
    def constructor(self):
        """Assemble the workflow that builds three indexes for one reference.

        Three independent steps (``create_bwa``, ``create_samtools``,
        ``create_dict``) each take the ``reference`` input; the ``merge``
        step then receives all three step results. The merged reference and
        each individual step result are exposed as outputs.
        """

        self.input("reference", Fasta)

        # Change the default BWA index algorithm to bwtsw (for human genome), and up blockSize to 50M
        self.input("bwa_algorithm", String(optional=True), default="bwtsw")
        self.input("bwa_block_size", Int(optional=True), default=int(5e7))

        self.step(
            "create_bwa",
            BwaIndexLatest(
                reference=self.reference,
                algorithm=self.bwa_algorithm,
                blockSize=self.bwa_block_size,
            ),
        )
        self.step("create_samtools",
                  SamToolsFaidxLatest(reference=self.reference))
        self.step(
            "create_dict",
            Gatk4CreateSequenceDictionaryLatest(reference=self.reference))

        # The step objects themselves (not a named output of each step)
        # are passed as sources here and in the outputs below.
        self.step(
            "merge",
            _JoinIndexedFasta(
                ref_bwa=self.create_bwa,
                ref_samtools=self.create_samtools,
                ref_dict=self.create_dict,
            ),
        )

        # NOTE(review): the last three outputs share output_name="reference";
        # confirm this does not make them collide when copied out.
        self.output("out_reference", source=self.merge.out_reference)
        self.output("out_bwa", source=self.create_bwa, output_name="reference")
        self.output("out_samtools",
                    source=self.create_samtools,
                    output_name="reference")
        self.output("out_dict",
                    source=self.create_dict,
                    output_name="reference")
Ejemplo n.º 26
0
 def inputs(self):
     """Return the ToolInput definitions: a localised reference plus two optional settings."""
     reference = ToolInput("reference", Fasta, position=1, localise_file=True)
     # NOTE: a "prefix" input (-p, prefix of the index) is intentionally
     # not part of the returned list.
     block_size = ToolInput(
         "blockSize",
         Int(optional=True),
         prefix="-b",
         doc="block size for the bwtsw algorithm (effective with -a bwtsw) [10000000]",
     )
     algorithm = ToolInput(
         "algorithm",
         String(optional=True),
         prefix="-a",
         doc="BWT construction algorithm: bwtsw, is or rb2 [auto]",
     )
     return [reference, block_size, algorithm]
Ejemplo n.º 27
0
 def inputs(self):
     """Return the inherited inputs followed by the seqz, window and output ToolInputs."""
     inherited = super(SeqzBinningBase, self).inputs()

     seqz = ToolInput(
         "seqz",
         File(),
         prefix="--seqz",
         position=2,
         doc="A seqz file.",
     )
     window = ToolInput(
         "window",
         Int(),
         prefix="--window",
         position=4,
         doc="Window size used for binning the original seqz file. Default is 50.",
     )
     output_filename = ToolInput(
         "output_filename",
         Filename(extension=".gz"),
         prefix="-o",
         position=6,
         doc='Output file "-" for STDOUT',
     )
     return [*inherited, seqz, window, output_filename]
 def add_inputs_for_configuration(self):
     """Add the facets and vardict configuration inputs on top of the parent's."""
     super().add_inputs_for_configuration()

     # facets
     facets_optionals = (
         ("pseudo_snps", Int),
         ("max_depth", Int),
         ("everything", Boolean),
         ("genome", String),
         ("cval", Int),
         ("purity_cval", Int),
         ("normal_depth", Int),
     )
     for identifier, datatype in facets_optionals:
         self.input(identifier, datatype(optional=True))

     # vardict
     threshold_doc = InputDocumentation(
         "The threshold for VarDict's allele frequency, default: 0.05 or 5%",
         quality=InputQualityType.configuration,
     )
     self.input(
         "allele_freq_threshold",
         Float,
         default=0.05,
         doc=threshold_doc,
     )
     vardict_optionals = (
         ("minMappingQual", Int),
         ("filter", String),
     )
     for identifier, datatype in vardict_optionals:
         self.input(identifier, datatype(optional=True))
Ejemplo n.º 29
0
    def constructor(self):
        """Assemble the somatic variant-calling workflow.

        Steps are registered in order because each one reads the output of
        an earlier step through ``self.<step_name>``:

        ``createCallRegions`` -> ``callVariants`` (scattered on ``region``)
        -> ``callSomatic`` (scattered on ``vcf``) -> ``combineRegions``
        -> ``sortSomatic1`` -> ``normalizeSomatic1`` -> ``allelicPrimitves``
        -> ``fixSplitLines`` -> ``sortSomatic2`` -> ``normalizeSomatic2``
        -> ``uniqueAlleles`` -> ``sortFinal`` -> ``uniqVcf``
        -> ``compressFinal`` -> ``indexFinal`` (output ``somaticOutVcf``).
        """

        self.input("bams", Array(CramCrai))

        self.input("reference", FastaFai)
        # NOTE(review): the builtin ``int`` is passed as the datatype here,
        # while every other input uses a project type such as ``Int`` -
        # presumably the framework converts it; confirm.
        self.input("regionSize", int, default=10000000)

        self.input("normalSample", String)
        self.input("sampleNames", Array(String, optional=True))

        # For the moment this is a bit wonky, because you need to specify something which is
        # affected by the number of bams that you supply (bam coverage just gets summed up at
        # this location).
        # So the formula at the moment would be nBams * coverage = skipCov,
        # which means for 8 bams with an average coverage of 160 you would probably want
        # 8 * 400 = 1600 to be on the safe side.
        # NOTE(review): 8 * 400 is 3200, not 1600 - the example figures above are
        # inconsistent; confirm the intended per-bam coverage and total.
        self.input("skipCov", Int(optional=True), default=500)

        # the same is true for min cov
        self.input("minCov", Int(optional=True), default=10)

        # This should be a conditional (if the call regions are supplied we use them, otherwise
        # we create them).
        self.step(
            "createCallRegions",
            CreateCallRegions(reference=self.reference,
                              regionSize=self.regionSize,
                              equalize=True),
        )

        self.step(
            "callVariants",
            FreeBayes(
                bams=self.bams,
                reference=self.reference,
                pooledDiscreteFlag=True,
                gtQuals=True,
                strictFlag=True,
                pooledContinousFlag=True,
                reportMaxGLFlag=True,
                noABPriorsFlag=True,
                maxNumOfAlleles=4,
                noPartObsFlag=True,
                region=self.createCallRegions.regions,
                skipCov=self.skipCov,
                # things that are actually default, but janis does not recognize yet
                useDupFlag=False,
                minBaseQual=1,
                minSupMQsum=0,
                minSupQsum=0,
                minCov=self.minCov,
                # now here we are trying to play with the detection limits
                # we set the fraction to be very low, to include ALL of the sites in a potential analysis
                minAltFrac=0.01,
                # and we want at least one sample that has two high quality variants OR multiple
                # lower quality ones
                minAltQSum=70,
                # but we also want to have at least two reads overall with that variant;
                # we do not care if they are between samples or if they are in the same sample, but
                # 2 is better than one
                minAltTotal=2,
            ),
            scatter="region",
        )
        # Might actually rewrite this once everything works, to not combine the files here, but
        # do all of it scattered and then only combine the final output.
        # self.step("combineRegions", VcfCombine(vcf=self.callVariants.out))

        #

        # self.step("compressAll", BGZip(file=self.sortAll.out))
        # self.step("indexAll", Tabix(file=self.compressAll.out))

        self.step(
            "callSomatic",
            CallSomaticFreeBayes(vcf=self.callVariants.out,
                                 normalSampleName=self.normalSample),
            # added for parallel
            scatter="vcf",
        )

        self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out))

        # should not be necessary here, but just to be safe
        self.step(
            "sortSomatic1",
            VcfStreamSort(vcf=self.combineRegions.out, inMemoryFlag=True),
        )

        # No need to compress this here if it leads to problems when we don't have an index for
        # the following "allelicPrimitves" step.
        self.step(
            "normalizeSomatic1",
            BcfToolsNorm(
                vcf=self.sortSomatic1.out,
                reference=self.reference,
                outputType="v",
                outputFilename="normalised.vcf",
            ),
        )

        self.step(
            "allelicPrimitves",
            VcfAllelicPrimitives(
                vcf=self.normalizeSomatic1.out,
                tagParsed="DECOMPOSED",
                keepGenoFlag=True,
            ),
        )

        self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out))

        self.step("sortSomatic2",
                  VcfStreamSort(vcf=self.fixSplitLines.out, inMemoryFlag=True))

        self.step(
            "normalizeSomatic2",
            BcfToolsNorm(
                vcf=self.sortSomatic2.out,
                reference=self.reference,
                outputType="v",
                outputFilename="normalised.vcf",
            ),
        )

        self.step("uniqueAlleles",
                  VcfUniqAlleles(vcf=self.normalizeSomatic2.out))

        self.step("sortFinal",
                  VcfStreamSort(vcf=self.uniqueAlleles.out, inMemoryFlag=True))

        self.step("uniqVcf", VcfUniq(vcf=self.sortFinal.out))

        self.step("compressFinal", BGZip(file=self.uniqVcf.out))

        self.step("indexFinal", Tabix(inp=self.compressFinal.out))

        # The step object itself (not a named output) is used as the source.
        self.output("somaticOutVcf", source=self.indexFinal)
Ejemplo n.º 30
0
class BwaMem_SamToolsView(BioinformaticsTool):
    """Combined tool that pipes ``bwa mem`` into ``samtools view``.

    There is no base command; the whole command line is assembled from the
    ``position`` of each argument and input:

    * 0-1: the literals ``bwa mem``
    * 2:   bwa options, the ``-R`` read-group line and the reference
    * 3-4: the reads and the optional mates
    * 5:   the shell pipe ``|``
    * 6-7: the literals ``samtools view``
    * 8:   samtools options, including ``-o <outputFilename>``
    """

    def tool(self) -> str:
        """Return the tool identifier."""
        return "BwaMemSamtoolsView"

    def tool_provider(self):
        """Return the provider name under which this tool is grouped."""
        return "common"

    def version(self):
        """Return the combined version string (bwa|samtools)."""
        return "0.7.17|1.9"

    def container(self):
        """Return the container image that ships both bwa and samtools."""
        return "michaelfranklin/bwasamtools:0.7.17-1.9"

    def base_command(self):
        """Return None: the command is built entirely from ``arguments()``."""
        return None

    def arguments(self):
        """Return the fixed arguments that make up the piped command line.

        Literal tokens (``bwa``, ``mem``, ``|``, ``samtools``, ``view``) are
        emitted unquoted so the shell interprets the pipe.
        """
        return [
            # --- bwa mem | samtools view skeleton ---
            ToolArgument("bwa", position=0, shell_quote=False),
            ToolArgument("mem", position=1, shell_quote=False),
            ToolArgument("|", position=5, shell_quote=False),
            ToolArgument("samtools", position=6, shell_quote=False),
            ToolArgument("view", position=7, shell_quote=False),
            # --- samtools view options (position 8) ---
            ToolArgument(
                InputSelector("reference"),
                prefix="-T",
                position=8,
                shell_quote=False,
            ),
            ToolArgument(
                CpuSelector(),
                position=8,
                shell_quote=False,
                prefix="--threads",
                doc="(-@)  Number of additional threads to use [0]",
            ),
            ToolArgument(
                "-h",
                position=8,
                shell_quote=False,
                doc="Include the header in the output.",
            ),
            ToolArgument(
                "-b",
                position=8,
                shell_quote=False,
                doc="Output in the BAM format.",
            ),
            # --- bwa mem options (position 2) ---
            # The read-group line is left shell-quoted (no shell_quote=False).
            ToolArgument(
                StringFormatter(
                    "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
                    name=InputSelector("sampleName"),
                    pl=InputSelector("platformTechnology"),
                ),
                prefix="-R",
                position=2,
                doc="Complete read group header line. ’\\t’ can be used in STR and will be converted to a TAB "
                "in the output SAM. The read group ID will be attached to every read in the output. "
                "An example is ’@RG\\tID:foo\\tSM:bar’. (Default=null) "
                "https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups",
            ),
            ToolArgument(
                CpuSelector(),
                prefix="-t",
                position=2,
                shell_quote=False,
                doc="Number of threads. (default = 1)",
            ),
        ]

    def inputs(self) -> List[ToolInput]:
        """Return the tool inputs.

        The explicitly listed inputs are followed by the class-level
        ``bwa_additional_inputs`` and ``samtools_additional_args`` lists.
        """
        return [
            ToolInput(
                "reference",
                FastaWithDict(),
                position=2,
                shell_quote=False,
            ),
            ToolInput(
                "reads",
                FastqGzPair,
                position=3,
                shell_quote=False,
                doc=None,
            ),
            ToolInput(
                "mates",
                FastqGzPair(optional=True),
                separator=" ",
                position=4,
                shell_quote=False,
                doc=None,
            ),
            ToolInput(
                "outputFilename",
                Filename(prefix=InputSelector("sampleName"), extension=".bam"),
                position=8,
                shell_quote=False,
                prefix="-o",
                doc="output file name [stdout]",
            ),
            # Eventually it would be cool to have like a cascading:
            #   - If readGroupHeaderLine provided, use that,
            #   - If sampleName provided, construct based on that
            #   - Else don't include
            # but this is probably a bit hard to do, and for all our purposes we require a readGroupHeaderLine,
            # so we're always going to construct it:
            ToolInput(
                "sampleName",
                String(),
                doc="Used to construct the readGroupHeaderLine with format: "
                "'@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{platformTechnology}'",
            ),
            ToolInput(
                "platformTechnology",
                String(optional=True),
                doc="(ReadGroup: PL) Used to construct the readGroupHeaderLine, defaults: ILLUMINA",
                default="ILLUMINA",
            ),
            *self.bwa_additional_inputs,
            *self.samtools_additional_args,
        ]

    def outputs(self) -> List[ToolOutput]:
        """Return the single BAM output, globbed by the ``outputFilename`` input."""
        return [
            ToolOutput("out", Bam(), glob=InputSelector("outputFilename")),
            # ToolOutput("skippedReads", File(optional=True), glob=InputSelector("skippedReadsOutputFilename"))
        ]

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request for the given hints.

        Uses the value resolved from ``BWA_MEM_TUPLE`` when it is truthy and
        falls back to 16 otherwise (units are defined by the framework;
        presumably GB - TODO confirm).
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, BWA_MEM_TUPLE)
        return val if val else 16

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request for the given hints.

        Uses the value resolved from ``BWA_CORES_TUPLE`` when it is truthy and
        falls back to 16 otherwise.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, BWA_CORES_TUPLE)
        return val if val else 16

    def friendly_name(self) -> str:
        """Return the human-readable tool name."""
        return "Bwa mem + Samtools View"

    # Optional `bwa mem` options; all are placed at position 2, i.e. before
    # the reads (position 3) on the bwa side of the pipe.
    bwa_additional_inputs = [
        ToolInput(
            "minimumSeedLength",
            Int(optional=True),
            prefix="-k",
            position=2,
            shell_quote=False,
            doc="Matches shorter than INT will be missed. The alignment speed is usually "
            "insensitive to this value unless it significantly deviates 20. (Default: 19)",
        ),
        ToolInput(
            "bandwidth",
            Int(optional=True),
            prefix="-w",
            position=2,
            shell_quote=False,
            doc="Essentially, gaps longer than ${bandWidth} will not be found. Note that the maximum gap length "
            "is also affected by the scoring matrix and the hit length, not solely determined by this option."
            " (Default: 100)",
        ),
        ToolInput(
            "offDiagonalXDropoff",
            Int(optional=True),
            prefix="-d",
            position=2,
            shell_quote=False,
            doc="(Z-dropoff): Stop extension when the difference between the best and the current extension "
            "score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, "
            "respectively, and A is the matching score. Z-dropoff is similar to BLAST’s X-dropoff except "
            "that it doesn’t penalize gaps in one of the sequences in the alignment. Z-dropoff not only "
            "avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. "
            "(Default: 100)",
        ),
        ToolInput(
            "reseedTrigger",
            Float(optional=True),
            prefix="-r",
            position=2,
            shell_quote=False,
            doc="Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter "
            "for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment "
            "speed but lower accuracy. (Default: 1.5)",
        ),
        ToolInput(
            "occurenceDiscard",
            Int(optional=True),
            prefix="-c",
            position=2,
            shell_quote=False,
            doc="Discard a MEM if it has more than INT occurence in the genome. "
            "This is an insensitive parameter. (Default: 10000)",
        ),
        ToolInput(
            "performSW",
            Boolean(optional=True),
            prefix="-P",
            position=2,
            shell_quote=False,
            doc="In the paired-end mode, perform SW to rescue missing hits only but "
            "do not try to find hits that fit a proper pair.",
        ),
        ToolInput(
            "matchingScore",
            Int(optional=True),
            prefix="-A",
            position=2,
            shell_quote=False,
            doc="Matching score. (Default: 1)",
        ),
        ToolInput(
            "mismatchPenalty",
            Int(optional=True),
            prefix="-B",
            position=2,
            shell_quote=False,
            doc="Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. "
            "(Default: 4)",
        ),
        ToolInput(
            "openGapPenalty",
            Int(optional=True),
            prefix="-O",
            position=2,
            shell_quote=False,
            doc="Gap open penalty. (Default: 6)",
        ),
        ToolInput(
            "gapExtensionPenalty",
            Int(optional=True),
            prefix="-E",
            position=2,
            shell_quote=False,
            doc="Gap extension penalty. A gap of length k costs O + k*E "
            "(i.e. -O is for opening a zero-length gap). (Default: 1)",
        ),
        ToolInput(
            "clippingPenalty",
            Int(optional=True),
            prefix="-L",
            position=2,
            shell_quote=False,
            doc="Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score "
            "reaching the end of query. If this score is larger than the best SW score minus the "
            "clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag "
            "reports the best SW score; clipping penalty is not deducted. (Default: 5)",
        ),
        ToolInput(
            "unpairedReadPenalty",
            Int(optional=True),
            prefix="-U",
            position=2,
            shell_quote=False,
            doc="Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as "
            "scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. "
            "It compares these two scores to determine whether we should force pairing. (Default: 9)",
        ),
        ToolInput(
            "assumeInterleavedFirstInput",
            Boolean(optional=True),
            prefix="-p",
            position=2,
            shell_quote=False,
            doc="Assume the first input query file is interleaved paired-end FASTA/Q. ",
        ),
        ToolInput(
            "outputAlignmentThreshold",
            Int(optional=True),
            prefix="-T",
            position=2,
            shell_quote=False,
            doc="Don’t output alignment with score lower than INT. Only affects output. (Default: 30)",
        ),
        ToolInput(
            "outputAllElements",
            Boolean(optional=True),
            prefix="-a",
            position=2,
            shell_quote=False,
            doc="Output all found alignments for single-end or unpaired paired-end reads. "
            "These alignments will be flagged as secondary alignments.",
        ),
        ToolInput(
            "appendComments",
            Boolean(optional=True),
            prefix="-C",
            position=2,
            shell_quote=False,
            doc="Append FASTA/Q comment to SAM output. This option can be used to transfer "
            "read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment "
            "(the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). "
            "Malformated comments lead to incorrect SAM output.",
        ),
        ToolInput(
            "hardClipping",
            Boolean(optional=True),
            prefix="-H",
            position=2,
            shell_quote=False,
            doc="Use hard clipping ’H’ in the SAM output. This option may dramatically reduce "
            "the redundancy of output when mapping long contig or BAC sequences.",
        ),
        ToolInput(
            "markShorterSplits",
            Boolean(optional=True),
            prefix="-M",
            position=2,
            shell_quote=False,
            doc="Mark shorter split hits as secondary (for Picard compatibility).",
        ),
        ToolInput(
            "verboseLevel",
            Int(optional=True),
            prefix="-v",
            position=2,
            shell_quote=False,
            doc="Control the verbose level of the output. "
            "This option has not been fully supported throughout BWA. Ideally, a value: "
            "0 for disabling all the output to stderr; "
            "1 for outputting errors only; "
            "2 for warnings and errors; "
            "3 for all normal messages; "
            "4 or higher for debugging. When this option takes value 4, the output is not SAM. (Default: 3)",
        ),
    ]

    # Optional `samtools view` options; all are placed at position 8, i.e.
    # after the `samtools view` literals on the right-hand side of the pipe.
    samtools_additional_args = [
        ToolInput(
            "skippedReadsOutputFilename",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-U",
            doc="output reads not selected by filters to FILE [null]",
        ),
        ToolInput(
            "referenceIndex",
            File(optional=True),
            position=8,
            shell_quote=False,
            prefix="-t",
            doc="FILE listing reference names and lengths (see long help) [null]",
        ),
        ToolInput(
            "intervals",
            Bed(optional=True),
            position=8,
            shell_quote=False,
            prefix="-L",
            doc="only include reads overlapping this BED FILE [null]",
        ),
        ToolInput(
            "includeReadsInReadGroup",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-r",
            doc="only include reads in read group STR [null]",
        ),
        ToolInput(
            "includeReadsInFile",
            File(optional=True),
            position=8,
            shell_quote=False,
            prefix="-R",
            doc="only include reads with read group listed in FILE [null]",
        ),
        ToolInput(
            "includeReadsWithQuality",
            Int(optional=True),
            position=8,
            shell_quote=False,
            prefix="-q",
            doc="only include reads with mapping quality >= INT [0]",
        ),
        ToolInput(
            "includeReadsInLibrary",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-l",
            doc="only include reads in library STR [null]",
        ),
        ToolInput(
            "includeReadsWithCIGAROps",
            Int(optional=True),
            position=8,
            shell_quote=False,
            prefix="-m",
            doc="only include reads with number of CIGAR operations consuming query sequence >= INT [0]",
        ),
        ToolInput(
            "includeReadsWithAllFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-f",
            separator=" ",
            doc="only include reads with all of the FLAGs in INT present [0]",
        ),
        ToolInput(
            "includeReadsWithoutFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-F",
            separator=" ",
            doc="only include reads with none of the FLAGS in INT present [0]",
        ),
        ToolInput(
            "excludeReadsWithAllFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-G",
            separator=" ",
            # The original text had a fragment of the `-s` (subsample) help
            # appended; it does not describe this option.
            doc="only EXCLUDE reads with all of the FLAGs in INT present [0]",
        ),
        ToolInput(
            "useMultiRegionIterator",
            Boolean(optional=True),
            position=8,
            shell_quote=False,
            prefix="-M",
            doc="use the multi-region iterator (increases the speed, removes "
            "duplicates and outputs the reads as they are ordered in the file)",
        ),
        ToolInput(
            "readTagToStrip",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-x",
            doc="read tag to strip (repeatable) [null]",
        ),
        ToolInput(
            "collapseBackwardCIGAROps",
            Boolean(optional=True),
            position=8,
            shell_quote=False,
            prefix="-B",
            # The original text had a fragment of the `--input-fmt-option`
            # help appended; it does not describe this option.
            doc="collapse the backward CIGAR operation",
        ),
        ToolInput(
            "outputFmt",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="--output-fmt",
            # The original text was garbled and included a fragment of the
            # `--output-fmt-option` help.
            doc="(-O) Specify output format (SAM, BAM, CRAM)",
        ),
    ]