def inputs(self) -> List[ToolInput]:
    """Return this tool's inputs.

    The four inputs declared here (intervals, output filename, BAM and
    reference) come first, followed by every entry of
    ``VarDictGermlineBase.vardict_inputs`` and then
    ``VarDictGermlineBase.var2vcf_inputs``, in that order.
    """
    intervals = ToolInput(
        tag="intervals", input_type=Bed(), position=2, shell_quote=False
    )
    # Generated ".vardict.vcf"-style filename, bound with a ">" prefix at
    # position 6 and shell quoting disabled.
    # NOTE(review): presumably this renders as a stdout redirect -- confirm.
    output_filename = ToolInput(
        tag="outputFilename",
        input_type=Filename(extension=".vcf", suffix=".vardict"),
        prefix=">",
        position=6,
        shell_quote=False,
    )
    bam = ToolInput(
        tag="bam",
        input_type=BamBai(),
        prefix="-b",
        position=1,
        shell_quote=False,
        doc="The indexed BAM file",
    )
    reference = ToolInput(
        tag="reference",
        input_type=FastaFai(),
        prefix="-G",
        position=1,
        shell_quote=False,
        doc=(
            "The reference fasta. Should be indexed (.fai). Defaults to: "
            "/ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa"
        ),
    )

    tool_inputs = [intervals, output_filename, bam, reference]
    tool_inputs.extend(VarDictGermlineBase.vardict_inputs)
    tool_inputs.extend(VarDictGermlineBase.var2vcf_inputs)
    return tool_inputs
Exemple #2
0
 def inputs(self):
     """Return this tool's inputs.

     Three inputs are declared here (input BAM, reference, output filename);
     the entries of ``ScrambleBase.additional_inputs`` are appended after them.
     """
     tool_inputs = [
         ToolInput(tag="inputFilename", input_type=Bam(), position=200),
         ToolInput(
             tag="reference",
             input_type=FastaFai(),
             prefix="-r",
             doc="Reference sequence file.",
         ),
         ToolInput(tag="outputFilename", input_type=Filename(extension=".bam")),
     ]
     tool_inputs.extend(ScrambleBase.additional_inputs)
     return tool_inputs
 def inputs(self) -> List[ToolInput]:
     """Return this tool's inputs.

     The inputs declared here come first, followed by every entry of
     ``VarDictSomaticCompressedBase.vardict_inputs`` and then
     ``VarDictSomaticCompressedBase.var2vcf_inputs``, in that order.
     """
     # These inputs carry no prefix or position of their own.
     # NOTE(review): presumably they are referenced from the tool's
     # arguments elsewhere -- confirm.
     tumor_bam = ToolInput(
         tag="tumorBam", input_type=BamBai(), doc="The indexed BAM file"
     )
     normal_bam = ToolInput(
         tag="normalBam", input_type=BamBai(), doc="The indexed BAM file"
     )
     intervals = ToolInput(
         tag="intervals", input_type=Bed(), position=2, shell_quote=False
     )
     reference = ToolInput(
         tag="reference",
         input_type=FastaFai(),
         prefix="-G",
         position=1,
         shell_quote=False,
         doc=(
             "The reference fasta. Should be indexed (.fai). Defaults to: "
             "/ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa"
         ),
     )
     tumor_name = ToolInput(
         tag="tumorName",
         input_type=String(),
         doc="The sample name to be used directly.  Will overwrite -n option",
     )
     normal_name = ToolInput(
         tag="normalName",
         input_type=String(),
         doc="The normal sample name to use with the -b option",
     )
     allele_freq_threshold = ToolInput(
         tag="alleleFreqThreshold",
         input_type=Float(optional=True),
         doc="The threshold for allele frequency, default: 0.05 or 5%",
     )
     output_filename = ToolInput(
         tag="outputFilename",
         input_type=Filename(extension=".vcf", suffix=".vardict"),
         prefix=">",
         position=10,
         shell_quote=False,
     )

     tool_inputs = [
         tumor_bam,
         normal_bam,
         intervals,
         reference,
         tumor_name,
         normal_name,
         allele_freq_threshold,
         output_filename,
     ]
     tool_inputs.extend(VarDictSomaticCompressedBase.vardict_inputs)
     tool_inputs.extend(VarDictSomaticCompressedBase.var2vcf_inputs)
     return tool_inputs
Exemple #4
0
class BcfToolsNormBase(BcfToolsToolBase, ABC):
    """Abstract tool definition wrapping the ``bcftools norm`` command.

    ``additional_args`` holds the optional command-line inputs; ``inputs()``
    returns the compressed VCF and output filename followed by those
    optional inputs.
    """

    # Optional inputs appended by inputs(). The doc strings quote the
    # corresponding long option of each flag.
    additional_args = [
        ToolInput(
            tag="checkRef",
            input_type=String(optional=True),
            prefix="-c",
            doc=(
                "--check-ref e|w|x|s: what to do when incorrect or missing REF allele is encountered: "
                "exit (e), warn (w), exclude (x), or set/fix (s) bad sites. The w option can be combined with "
                "x and s. Note that s can swap alleles and will update genotypes (GT) and AC counts, but will "
                "not attempt to fix PL or other fields. Also note, and this cannot be stressed enough, that s "
                "will NOT fix strand issues in your VCF, do NOT use it for that purpose!!! "
                "(Instead see http://samtools.github.io/bcftools/howtos/plugin.af-dist.html "
                "and http://samtools.github.io/bcftools/howtos/plugin.fixref.html.)"
            ),
        ),
        ToolInput(
            tag="removeDups",
            input_type=String(optional=True),
            prefix="-d",
            doc=(
                "--rm-dup: snps|indels|both|all|none. If a record is present multiple times, "
                "output only the first instance, see --collapse in Common Options."
            ),
        ),
        ToolInput(
            tag="removeDupsAcrossFiles",
            input_type=Boolean(optional=True),
            prefix="-D",
            doc=(
                "--remove-duplicates: If a record is present in multiple files, "
                "output only the first instance. Alias for -d none, deprecated."
            ),
        ),
        ToolInput(
            tag="reference",
            input_type=FastaFai(optional=True),
            prefix="-f",
            doc=(
                "--fasta-ref: reference sequence. Supplying this option will turn on left-alignment and "
                "normalization, however, see also the --do-not-normalize option below."
            ),
        ),
        ToolInput(
            tag="multiallelics",
            input_type=String(optional=True),
            prefix="-m",
            default="-",
            doc=(
                "--multiallelics -|+[snps|indels|both|any]: split multiallelic sites into "
                "biallelic records (-) or join biallelic sites into multiallelic records (+). "
                "An optional type string can follow which controls variant types which should "
                "be split or merged together: If only SNP records should be split or merged, "
                "specify snps; if both SNPs and indels should be merged separately into two "
                "records, specify both; if SNPs and indels should be merged into a single record, specify any."
            ),
        ),
        ToolInput(
            tag="noVersion",
            input_type=Boolean(optional=True),
            prefix="--no-version",
            doc="Do not append version and command line information to the output VCF header.",
        ),
        ToolInput(
            tag="noNormalize",
            input_type=Boolean(optional=True),
            prefix="-N",
            doc=(
                "--do-not-normalize: the -c s option can be used to fix or set the REF allele from the reference"
                " -f. The -N option will not turn on indel normalisation as the -f option normally implies"
            ),
        ),
        ToolInput(
            tag="outputType",
            input_type=String(optional=True),
            prefix="-O",
            default="z",
            doc=(
                "--output-type b|u|z|v: Output compressed BCF (b), uncompressed BCF (u), "
                "compressed VCF (z), uncompressed VCF (v). Use the -Ou option when piping "
                "between bcftools subcommands to speed up performance by removing "
                "unnecessary compression/decompression and VCF←→BCF conversion."
            ),
        ),
        ToolInput(
            tag="regions",
            input_type=String(optional=True),
            prefix="-r",
            doc=(
                "--regions chr|chr:pos|chr:from-to|chr:from-[,…]: Comma-separated list of regions, "
                "see also -R, --regions-file. Note that -r cannot be used in combination with -R."
            ),
        ),
        ToolInput(
            tag="regionsFile",
            input_type=File(optional=True),
            prefix="-R",
            doc=(
                "--regions-file: Regions can be specified either on command line or in a VCF, BED, or "
                "tab-delimited file (the default). The columns of the tab-delimited file are: CHROM, POS, "
                "and, optionally, POS_TO, where positions are 1-based and inclusive. The columns of the "
                "tab-delimited BED file are also CHROM, POS and POS_TO (trailing columns are ignored), "
                "but coordinates are 0-based, half-open. To indicate that a file be treated as BED rather "
                "than the 1-based tab-delimited file, the file must have the '.bed' or '.bed.gz' suffix "
                "(case-insensitive). Uncompressed files are stored in memory, while bgzip-compressed and "
                "tabix-indexed region files are streamed. Note that sequence names must match exactly, 'chr20'"
                " is not the same as '20'. Also note that chromosome ordering in FILE will be respected, "
                "the VCF will be processed in the order in which chromosomes first appear in FILE. "
                "However, within chromosomes, the VCF will always be processed in ascending genomic coordinate "
                "order no matter what order they appear in FILE. Note that overlapping regions in FILE can "
                "result in duplicated out of order positions in the output. This option requires indexed "
                "VCF/BCF files. Note that -R cannot be used in combination with -r."
            ),
        ),
        ToolInput(
            tag="strictFilter",
            input_type=Boolean(optional=True),
            prefix="-s",
            doc="--strict-filter: when merging (-m+), merged site is PASS only if all sites being merged PASS",
        ),
        ToolInput(
            tag="targets",
            input_type=Array(File(), optional=True),
            prefix="-t",
            doc=(
                "--targets: [^]chr|chr:pos|chr:from-to|chr:from-[,…]: Similar as -r, --regions, but the next "
                "position is accessed by streaming the whole VCF/BCF rather than using the tbi/csi index. "
                "Both -r and -t options can be applied simultaneously: -r uses the index to jump to a region "
                "and -t discards positions which are not in the targets. Unlike -r, targets can be prefixed "
                "with '^' to request logical complement. For example, '^X,Y,MT' indicates that sequences "
                "X, Y and MT should be skipped. Yet another difference between the two is that -r checks "
                "both start and end positions of indels, whereas -t checks start positions only. "
                "Note that -t cannot be used in combination with -T. "
            ),
        ),
        ToolInput(
            tag="targetsFile",
            input_type=File(optional=True),
            prefix="-T",
            doc=(
                "--targets-file: Same -t, --targets, but reads regions from a file. "
                "Note that -T cannot be used in combination with -t. With the call -C alleles command, "
                "third column of the targets file must be comma-separated list of alleles, starting with "
                "the reference allele. Note that the file must be compressed and index. "
                "Such a file can be easily created from a VCF using: "
                "`bcftools query -f'%CHROM\\t%POS\\t%REF,%ALT\\n' file.vcf | bgzip -c > als.tsv.gz "
                "&& tabix -s1 -b2 -e2 als.tsv.gz`"
            ),
        ),
        ToolInput(
            tag="threads",
            input_type=Int(optional=True),
            prefix="--threads",
            doc=(
                "Number of output compression threads to use in addition to main thread. "
                "Only used when --output-type is b or z. Default: 0."
            ),
        ),
        ToolInput(
            tag="siteWin",
            input_type=Int(optional=True),
            prefix="-w",
            doc=(
                "--site-win: maximum distance between two records to consider when locally "
                "sorting variants which changed position during the realignment"
            ),
        ),
    ]

    def tool(self):
        """Return the tool identifier string."""
        return "bcftoolsNorm"

    def friendly_name(self):
        """Return the human-readable tool name."""
        return "BCFTools: Normalize"

    def base_command(self):
        """Return the base command as a list of tokens."""
        return ["bcftools", "norm"]

    def inputs(self):
        """Return the VCF and output-filename inputs followed by ``additional_args``."""
        vcf = ToolInput(tag="vcf", input_type=CompressedVcf, position=10)
        output_filename = ToolInput(
            tag="outputFilename",
            input_type=Filename(extension=".vcf.gz"),
            prefix="-o",
            doc=(
                "--output: When output consists of a single stream, "
                "write it to FILE rather than to standard output, where it is written by default."
            ),
        )
        tool_inputs = [vcf, output_filename]
        tool_inputs.extend(self.additional_args)
        return tool_inputs

    def outputs(self):
        """Return the single output ``out``, globbed from the ``outputFilename`` input."""
        out = ToolOutput(
            "out",
            CompressedVcf,
            glob=InputSelector("outputFilename"),
        )
        return [out]

    def bind_metadata(self):
        """Populate ``self.metadata`` with contributors, dates, citation and documentation."""
        import datetime

        self.metadata.contributors = ["Michael Franklin"]
        self.metadata.dateCreated = datetime.date(2019, 1, 24)
        self.metadata.dateUpdated = datetime.date(2019, 1, 24)
        self.metadata.doi = "http://www.ncbi.nlm.nih.gov/pubmed/19505943"
        self.metadata.citation = (
            "Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, "
            "and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) "
            "format and SAMtools, Bioinformatics (2009) 25(16) 2078-9"
        )
        self.metadata.documentationUrl = (
            "https://samtools.github.io/bcftools/bcftools.html#norm"
        )
        self.metadata.documentation = (
            "Left-align and normalize indels, check if REF alleles match the reference,\n"
            "split multiallelic sites into multiple rows; recover multiallelics from\n"
            "multiple rows. Left-alignment and normalization will only be applied if\n"
            "the --fasta-ref option is supplied.\n"
        )
 def inputs(self):
     """Return this tool's inputs for the Strelka somatic workflow.

     Inputs bound at ``position=1`` are the configuration options (BAMs,
     reference, run directory, regions, scoring models, ...). Inputs bound
     at ``position=3`` are the run options (mode, queue, memory, quiet).

     Most ``prefix`` values end with "=" and set
     ``separate_value_from_prefix=False``.
     NOTE(review): presumably this renders the option as ``--opt=value`` --
     confirm against the ToolInput implementation.

     Documentation strings fixed relative to the previous revision:
       * ``region``: a missing space at a string-concatenation boundary
         produced "BED file,then".
       * ``snvscoringmodelfile`` / ``indelscoringmodelfile``: the default
         paths contained a stray space inside the file name.
     """
     return [
         # Inputs left commented out (not exposed):
         # ToolInput(tag="version", input_type=Boolean(), prefix="--version", separate_value_from_prefix=True,
         #           doc="show program's version number and exit"),
         # ToolInput(tag="help", input_type=Boolean(), prefix="--help", separate_value_from_prefix=True,
         #           doc="(-h) show this help message and exit"),
         # ToolInput(tag="allhelp", input_type=Boolean(), prefix="--allHelp", separate_value_from_prefix=True,
         #           doc="show all extended/hidden options"),
         ToolInput(
             tag="normalBam",
             input_type=BamBai(),
             prefix="--normalBam=",
             separate_value_from_prefix=False,
             position=1,
             doc="Normal sample BAM or CRAM file. (no default)",
         ),
         # The prefix uses the "tumour" spelling while the doc text names
         # "--tumorBam"; both are kept exactly as they were.
         ToolInput(
             tag="tumorBam",
             input_type=BamBai(),
             prefix="--tumourBam=",
             separate_value_from_prefix=False,
             position=1,
             doc="(--tumorBam)  Tumor sample BAM or CRAM file. [required] (no default)",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaFai(),
             prefix="--referenceFasta=",
             position=1,
             separate_value_from_prefix=False,
             doc=" samtools-indexed reference fasta file [required]",
         ),
         ToolInput(
             tag="rundir",
             input_type=Filename(),
             prefix="--runDir=",
             separate_value_from_prefix=False,
             position=1,
             doc=(
                 "Name of directory to be created where all workflow scripts and output will be written. "
                 "Each analysis requires a separate directory. (default: StrelkaSomaticWorkflow)"
             ),
         ),
         ToolInput(
             tag="region",
             input_type=Array(String, optional=True),
             prefix="--region",
             prefix_applies_to_all_elements=True,
             position=1,
             doc=(
                 "Limit the analysis to one or more genome region(s) for debugging purposes. If this argument "
                 "is provided multiple times the union of all specified regions will be analyzed. All regions "
                 "must be non-overlapping to get a meaningful result. Examples: '--region chr20' "
                 "(whole chromosome), '--region chr2:100-2000 --region chr3:2500-3000' (two regions)'. "
                 # Trailing space added: the pieces previously joined as "file,then".
                 "If this option is specified (one or more times) together with the 'callRegions' BED file, "
                 "then all region arguments will be intersected with the callRegions BED track."
             ),
         ),
         ToolInput(
             tag="config",
             input_type=File(optional=True),
             prefix="--config=",
             separate_value_from_prefix=False,
             position=1,
             doc=(
                 "provide a configuration file to override defaults in global config file "
                 "(/opt/strelka/bin/configureStrelkaSomaticWorkflow.py.ini)"
             ),
         ),
         ToolInput(
             tag="outputcallableregions",
             input_type=Boolean(optional=True),
             prefix="--outputCallableRegions",
             position=1,
             separate_value_from_prefix=True,
             doc="Output a bed file describing somatic callable regions of the genome",
         ),
         ToolInput(
             tag="indelCandidates",
             input_type=Array(VcfTabix, optional=True),
             prefix="--indelCandidates=",
             prefix_applies_to_all_elements=True,
             position=1,
             separate_value_from_prefix=False,
             doc=(
                 "Specify a VCF of candidate indel alleles. These alleles are always evaluated "
                 "but only reported in the output when they are inferred to exist in the sample. "
                 "The VCF must be tabix indexed. All indel alleles must be left-shifted/normalized, "
                 "any unnormalized alleles will be ignored. This option may be specified more than once, "
                 "multiple input VCFs will be merged. (default: None)"
             ),
         ),
         ToolInput(
             tag="forcedgt",
             input_type=Array(VcfTabix, optional=True),
             prefix="--forcedGT=",
             separate_value_from_prefix=False,
             prefix_applies_to_all_elements=True,
             position=1,
             doc=(
                 "Specify a VCF of candidate alleles. These alleles are always evaluated and reported even "
                 "if they are unlikely to exist in the sample. The VCF must be tabix indexed. All indel "
                 "alleles must be left- shifted/normalized, any unnormalized allele will trigger a runtime "
                 "error. This option may be specified more than once, multiple input VCFs will be merged. "
                 "Note that for any SNVs provided in the VCF, the SNV site will be reported (and for gVCF, "
                 "excluded from block compression), but the specific SNV alleles are ignored. (default: None)"
             ),
         ),
         ToolInput(
             tag="targeted",
             input_type=Boolean(optional=True),
             prefix="--targeted",
             separate_value_from_prefix=True,
             position=1,
             doc=(
                 "Set options for other targeted input: "
                 "note in particular that this flag turns off high-depth filters"
             ),
         ),
         ToolInput(
             tag="exome",
             input_type=Boolean(optional=True),
             prefix="--exome",
             separate_value_from_prefix=True,
             position=1,
             doc="Set options for exome: note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="callRegions",
             input_type=BedTabix(optional=True),
             prefix="--callRegions=",
             separate_value_from_prefix=False,
             position=1,
             doc=(
                 "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
                 "regions to call. No VCF output will be provided outside of these regions. "
                 "The full genome will still be used to estimate statistics from the input "
                 "(such as expected depth per chromosome). Only one BED file may be specified. "
                 "(default: call the entire genome)"
             ),
         ),
         ToolInput(
             tag="noisevcf",
             input_type=VcfTabix(optional=True),
             prefix="--noiseVcf=",
             separate_value_from_prefix=False,
             position=1,
             doc="Noise vcf file (submit argument multiple times for more than one file)",
         ),
         ToolInput(
             tag="scansizemb",
             input_type=Int(optional=True),
             prefix="--scanSizeMb=",
             separate_value_from_prefix=False,
             position=1,
             doc=(
                 "Maximum sequence region size (in megabases) scanned by each "
                 "task during genome variant calling. (default: 12)"
             ),
         ),
         ToolInput(
             tag="callmemmb",
             input_type=Int(optional=True),
             prefix="--callMemMb=",
             position=1,
             separate_value_from_prefix=False,
             doc=(
                 "Set variant calling task memory limit (in megabytes). It is not recommended to change the "
                 "default in most cases, but this might be required for a sample of unusual depth."
             ),
         ),
         ToolInput(
             tag="retaintempfiles",
             input_type=Boolean(optional=True),
             default=False,
             position=1,
             prefix="--retainTempFiles",
             separate_value_from_prefix=True,
             doc="Keep all temporary files (for workflow debugging)",
         ),
         ToolInput(
             tag="disableevs",
             input_type=Boolean(optional=True),
             prefix="--disableEVS",
             position=1,
             separate_value_from_prefix=True,
             doc="Disable empirical variant scoring (EVS).",
         ),
         ToolInput(
             tag="reportevsfeatures",
             input_type=Boolean(optional=True),
             prefix="--reportEVSFeatures",
             position=1,
             separate_value_from_prefix=True,
             doc=" Report all empirical variant scoring features in VCF output.",
         ),
         ToolInput(
             tag="snvscoringmodelfile",
             input_type=File(optional=True),
             prefix="--snvScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             doc=(
                 " Provide a custom empirical scoring model file for SNVs "
                 # File name repaired (was "somaticSNVScoringM odels.json").
                 "(default: /opt/strelka/share/config/somaticSNVScoringModels.json)"
             ),
         ),
         ToolInput(
             tag="indelscoringmodelfile",
             input_type=File(optional=True),
             prefix="--indelScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             doc=(
                 " Provide a custom empirical scoring model file for indels "
                 # File name repaired (was "somaticInde lScoringModels.json").
                 "(default: /opt/strelka/share/config/somaticIndelScoringModels.json)"
             ),
         ),
         # Run options, bound at position 3 with shell quoting disabled.
         ToolInput(
             "mode",
             String(optional=True),
             default="local",
             prefix="--mode",
             position=3,
             shell_quote=False,
             doc="(-m MODE)  select run mode (local|sge)",
         ),
         ToolInput(
             "queue",
             String(optional=True),
             prefix="--queue",
             position=3,
             shell_quote=False,
             doc="(-q QUEUE) specify scheduler queue name",
         ),
         # Typed as String although the doc text says the value must be an
         # integer; the type is left unchanged to keep the interface stable.
         ToolInput(
             "memGb",
             String(optional=True),
             prefix="--memGb",
             position=3,
             shell_quote=False,
             doc=(
                 " (-g MEMGB) gigabytes of memory available to run workflow "
                 "-- only meaningful in local mode, must be an integer (default: Estimate the total "
                 "memory for this node for local mode, 'unlimited' for sge mode)"
             ),
         ),
         ToolInput(
             "quiet",
             Boolean(optional=True),
             prefix="--quiet",
             position=3,
             shell_quote=False,
             doc=(
                 "Don't write any log output to stderr "
                 "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)"
             ),
         ),
         # ToolInput("mailTo", String(optional=True), prefix="--mailTo", position=3, shell_quote=False,
         #           doc="(-e) send email notification of job completion status to this address "
         #               "(may be provided multiple times for more than one email address)"),
     ]
 def inputs(self):
     return [
         ToolInput(
             tag="bams",
             input_type=Array(BamBai),
             prefix="-b",
             prefix_applies_to_all_elements=True,
             doc="Add FILE to the set of BAM files to be analyzed.",
         ),
         ToolInput(
             tag="bamList",
             input_type=TextFile(optional=True),
             prefix="-L",
             doc="A file containing a list of BAM files to be analyzed.",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaFai(),
             prefix="-f",
             doc=
             " Use FILE as the reference sequence for analysis. An index file (FILE.fai) will be created if none exists. If neither --targets nor --region are specified, FreeBayes will analyze every position in this reference.",
         ),
         ToolInput(
             tag="targetsFile",
             prefix="-t",
             input_type=Bed(optional=True),
             doc=" Limit analysis to targets listed in the BED-format FILE.",
         ),
         ToolInput(
             tag="region",
             prefix="-r",
             input_type=String(optional=True),
             doc=
             "<chrom>:<start_position>-<end_position> Limit analysis to the specified region, 0-base coordinates, end_position not included (same as BED format). Either '-' or '..' maybe used as a separator.",
         ),
         ToolInput(
             tag="samplesFile",
             prefix="-s",
             input_type=TextFile(optional=True),
             doc=
             "FILE  Limit analysis to samples listed (one per line) in the FILE. By default FreeBayes will analyze all samples in its input BAM files.",
         ),
         ToolInput(
             tag="popFile",
             prefix="--populations",
             input_type=TextFile(optional=True),
             doc=
             "FILE Each line of FILE should list a sample and a population which it is part of. The population-based bayesian inference model will then be partitioned on the basis of the populations.",
         ),
         ToolInput(
             tag="cnvFile",
             prefix="-A",
             input_type=TextFile(optional=True),
             doc=
             "FILE Read a copy number map from the BED file FILE, which has either a sample-level ploidy: sample name, copy number or a region-specific format: reference sequence, start, end, sample name, copy number ... for each region in each sample which does not have the default copy number as set by --ploidy.",
         ),
         ToolInput(
             tag="outputFilename",
             prefix="-v",
             input_type=Filename(extension=".vcf"),
             doc="FILE Output VCF-format results to FILE. (default: stdout)",
         ),
         ToolInput(
             tag="gvcfFlag",
             prefix="--gvcf",
             input_type=Boolean(optional=True),
             default=False,
             doc=
             "Write gVCF output, which indicates coverage in uncalled regions.",
         ),
         ToolInput(
             tag="gvcfChunkSize",
             prefix="--gvcf-chunk",
             input_type=Int(optional=True),
             doc=
             " When writing gVCF output emit a record for every NUM bases.",
         ),
         ToolInput(
             tag="candidateVcf",
             prefix="-@",
             input_type=File(optional=True),
             doc=
             " Use variants reported in VCF file as input to the algorithm. Variants in this file will included in the output even if there is not enough support in the data to pass input filters.",
         ),
         ToolInput(
             tag="restrictSitesFlag",
             prefix="-l",
             input_type=Boolean(optional=True),
             doc=
             "Only provide variant calls and genotype likelihoods for sites and alleles which are provided in the VCF input, and provide output in the VCF for all input alleles, not just those which have support in the data.",
         ),
         ToolInput(
             tag="candidateHaploVcf",
             prefix="--haplotype-basis-alleles",
             input_type=File(optional=True),
             doc=
             "When specified, only variant alleles provided in this input VCF will be used for the construction of complex or haplotype alleles.",
         ),
         ToolInput(
             tag="reportHapAllelesFlag",
             prefix="--report-all-haplotype-alleles",
             input_type=Boolean(optional=True),
             doc=
             "At sites where genotypes are made over haplotype alleles, provide information about all alleles in output, not only those which are called.",
         ),
         ToolInput(
             tag="monomorphicFlag",
             prefix="--report-monomorphic",
             input_type=Boolean(optional=True),
             doc=
             " Report even loci which appear to be monomorphic, and report all considered alleles, even those which are not in called genotypes. Loci which do not have any potential alternates have '.' for ALT.",
         ),
         ToolInput(
             tag="polyMoprhProbFlag",
             prefix="-P",
             input_type=Float(optional=True),
             default=0.0,
             doc=
             "Report sites if the probability that there is a polymorphism at the site is greater than N. default: 0.0. Note that post-filtering is generally recommended over the use of this parameter.",
         ),
         ToolInput(
             tag="strictFlag",
             prefix="--strict-vcf",
             input_type=Boolean(optional=True),
             doc="Generate strict VCF format (FORMAT/GQ will be an int)",
         ),
         ToolInput(
             tag="theta",
             prefix="-T",
             input_type=Float(),
             default=0.001,
             doc=
             "The expected mutation rate or pairwise nucleotide diversity among the population under analysis. This serves as the single parameter to the Ewens Sampling Formula prior model default: 0.001",
         ),
         ToolInput(
             tag="ploidy",
             prefix="-p",
             input_type=Int(),
             default=2,
             doc="Sets the default ploidy for the analysis to N. default: 2",
         ),
         ToolInput(
             tag="pooledDiscreteFlag",
             prefix="-J",
             input_type=Boolean(optional=True),
             doc=
             "Assume that samples result from pooled sequencing. Model pooled samples using discrete genotypes across pools. When using this flag, set --ploidy to the number of alleles in each sample or use the --cnv-map to define per-sample ploidy.",
         ),
         ToolInput(
             tag="pooledContinousFlag",
             prefix="-K",
             input_type=Boolean(optional=True),
             doc=
             "Output all alleles which pass input filters, regardles of genotyping outcome or model.",
         ),
         ToolInput(
             tag="addRefFlag",
             prefix="-Z",
             input_type=Boolean(optional=True),
             doc=
             "This flag includes the reference allele in the analysis as if it is another sample from the same population.",
         ),
         ToolInput(
             tag="refQual",
             prefix="--reference-quality",
             input_type=String(),
             default="100,60",
             doc=
             "--reference-quality MQ,BQ  Assign mapping quality of MQ to the reference allele at each site and base quality of BQ. default: 100,60",
         ),
         ToolInput(
             tag="ignoreSNPsFlag",
             prefix="-I",
             input_type=Boolean(optional=True),
             doc="Ignore SNP alleles.",
         ),
         ToolInput(
             tag="ignoreINDELsFlag",
             prefix="-i",
             input_type=Boolean(optional=True),
             doc="Ignore insertion and deletion alleles.",
         ),
         ToolInput(
             tag="ignoreMNPsFlag",
             prefix="-X",
             input_type=Boolean(optional=True),
             doc="Ignore multi-nuceotide polymorphisms, MNPs.",
         ),
         ToolInput(
             tag="ignoreComplexVarsFlag",
             prefix="-u",
             input_type=Boolean(optional=True),
             doc="Ignore complex events (composites of other classes).",
         ),
         ToolInput(
             tag="maxNumOfAlleles",
             prefix="-n",
             input_type=Int(),
             default=0,
             doc=
             "Evaluate only the best N SNP alleles, ranked by sum of supporting quality scores. (Set to 0 to use all; default: all)",
         ),
         ToolInput(
             tag="maxNumOfComplexVars",
             prefix="-E",
             input_type=Int(optional=True),
             doc="",
         ),
         ToolInput(
             tag="haplotypeLength",
             prefix="--haplotype-length",
             input_type=Int(),
             default=3,
             doc=
             "Allow haplotype calls with contiguous embedded matches of up to this length. Set N=-1 to disable clumping. (default: 3)",
         ),
         ToolInput(
             tag="minRepSize",
             prefix="--min-repeat-size",
             input_type=Int(),
             default=5,
             doc=
             "When assembling observations across repeats, require the total repeat length at least this many bp. (default: 5)",
         ),
         ToolInput(
             tag="minRepEntropy",
             prefix="--min-repeat-entropy",
             input_type=Int(),
             default=1,
             doc=
             "To detect interrupted repeats, build across sequence until it has  entropy > N bits per bp. Set to 0 to turn off. (default: 1)",
         ),
         ToolInput(
             tag="noPartObsFlag",
             prefix="--no-partial-observations",
             input_type=Boolean(optional=True),
             doc=
             "Exclude observations which do not fully span the dynamically-determined detection window. (default, use all observations, dividing partial support across matching haplotypes when generating haplotypes.)",
         ),
         ToolInput(
             tag="noNormaliseFlag",
             prefix="-O",
             input_type=Boolean(optional=True),
             doc=
             "Turn off left-alignment of indels, which is enabled by default.",
         ),
         ToolInput(
             tag="useDupFlag",
             prefix="-4",
             input_type=Boolean(),
             default=False,
             doc=
             "Include duplicate-marked alignments in the analysis. default: exclude duplicates marked as such in alignments",
         ),
         ToolInput(
             tag="minMappingQual",
             prefix="-m",
             input_type=Int(),
             default=1,
             doc=
             " Exclude alignments from analysis if they have a mapping quality less than Q. default: 1",
         ),
         ToolInput(
             tag="minBaseQual",
             prefix="-q",
             input_type=Int(),
             default=0,
             doc=
             " -q --min-base-quality Q Exclude alleles from analysis if their supporting base quality is less than Q. default: 0",
         ),
         ToolInput(
             tag="minSupQsum",
             prefix="-R",
             input_type=Int(),
             default=0,
             doc=
             " -R --min-supporting-allele-qsum Q Consider any allele in which the sum of qualities of supporting observations is at least Q. default: 0",
         ),
         ToolInput(
             tag="minSupMQsum",
             prefix="-Y",
             input_type=Int(),
             default=0,
             doc=
             " -Y --min-supporting-mapping-qsum Q Consider any allele in which and the sum of mapping qualities of supporting reads is at least Q. default: 0",
         ),
         ToolInput(
             tag="minSupBQthres",
             prefix="-Q",
             input_type=Int(),
             default=10,
             doc=
             " -Q --mismatch-base-quality-threshold Q Count mismatches toward --read-mismatch-limit if the base quality of the mismatch is >= Q. default: 10",
         ),
         ToolInput(
             tag="readMisMatchLim",
             prefix="-U",
             input_type=Int(optional=True),
             doc=
             " -U --read-mismatch-limit N Exclude reads with more than N mismatches where each mismatch has base quality >= mismatch-base-quality-threshold. default: ~unbounded",
         ),
         ToolInput(
             tag="maxMisMatchFrac",
             prefix="-z",
             input_type=Float(),
             default=1.0,
             doc=
             " -z --read-max-mismatch-fraction N Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality >= mismatch-base-quality-threshold default: 1.0",
         ),
         ToolInput(
             tag="readSNPLim",
             prefix="-$",
             input_type=Int(optional=True),
             doc=
             " -$ --read-snp-limit N Exclude reads with more than N base mismatches, ignoring gaps with quality >= mismatch-base-quality-threshold. default: ~unbounded",
         ),
         ToolInput(
             tag="readINDELLim",
             prefix="-e",
             input_type=Int(optional=True),
             doc=
             " -e --read-indel-limit N Exclude reads with more than N separate gaps. default: ~unbounded",
         ),
         ToolInput(
             tag="standardFilterFlag",
             prefix="-0",
             input_type=Boolean(optional=True),
             doc=
             " -0 --standard-filters Use stringent input base and mapping quality filters Equivalent to -m 30 -q 20 -R 0 -S 0",
         ),
         ToolInput(
             tag="minAltFrac",
             prefix="-F",
             input_type=Float(),
             default=0.05,
             doc=
             " -F --min-alternate-fraction N Require at least this fraction of observations supporting an alternate allele within a single individual in the in order to evaluate the position. default: 0.05",
         ),
         ToolInput(
             tag="minAltCount",
             prefix="-C",
             input_type=Int(),
             default=2,
             doc=
             " -C --min-alternate-count N Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position. default: 2",
         ),
         ToolInput(
             tag="minAltQSum",
             prefix="-3",
             input_type=Int(),
             default=0,
             doc=
             " -3 --min-alternate-qsum N Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position. default: 0",
         ),
         ToolInput(
             tag="minAltTotal",
             prefix="-G",
             input_type=Int(),
             default=1,
             doc=
             " -G --min-alternate-total N Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis. default: 1",
         ),
         ToolInput(
             tag="minCov",
             prefix="--min-coverage",
             input_type=Int(),
             default=0,
             doc=
             " --min-coverage N Require at least this coverage to process a site. default: 0",
         ),
         ToolInput(
             tag="maxCov",
             prefix="--max-coverage",
             input_type=Int(optional=True),
             doc=
             " --max-coverage N Do not process sites with greater than this coverage. default: no limit",
         ),
         ToolInput(
             tag="noPopPriorsFlag",
             prefix="-k",
             input_type=Boolean(optional=True),
             doc=
             " -k --no-population-priors Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens Sampling Formula component of priors.",
         ),
         ToolInput(
             tag="noHWEPriorsFlag",
             prefix="-w",
             input_type=Boolean(optional=True),
             doc=
             " -w --hwe-priors-off Disable estimation of the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency.",
         ),
         ToolInput(
             tag="noBinOBSPriorsFlag",
             prefix="-V",
             input_type=Boolean(optional=True),
             doc=
             " -V --binomial-obs-priors-off Disable incorporation of prior expectations about observations. Uses read placement probability, strand balance probability, and read position (5'-3') probability.",
         ),
         ToolInput(
             tag="noABPriorsFlag",
             prefix="-a",
             input_type=Boolean(optional=True),
             doc=
             " -a --allele-balance-priors-off Disable use of aggregate probability of observation balance between alleles as a component of the priors.",
         ),
         ToolInput(
             tag="obsBiasFile",
             prefix="--observation-bias",
             input_type=TextFile(optional=True),
             doc=
             " --observation-bias FILE Read length-dependent allele observation biases from FILE. The format is [length] [alignment efficiency relative to reference] where the efficiency is 1 if there is no relative observation bias.",
         ),
         ToolInput(
             tag="baseQualCap",
             prefix="--base-quality-cap",
             input_type=Int(optional=True),
             doc=
             " --base-quality-cap Q Limit estimated observation quality by capping base quality at Q.",
         ),
         ToolInput(
             tag="probContamin",
             prefix="--prob-contamination",
             input_type=Float(),
             default=0.000000001,
             doc=
             " --prob-contamination F An estimate of contamination to use for all samples. default: 10e-9",
         ),
         ToolInput(
             tag="legGLScalc",
             prefix="--legacy-gls",
             input_type=Boolean(optional=True),
             doc=
             " --legacy-gls Use legacy (polybayes equivalent) genotype likelihood calculations",
         ),
         ToolInput(
             tag="contaminEst",
             prefix="--contamination-estimates",
             input_type=TextFile(optional=True),
             doc=
             " --contamination-estimates FILE A file containing per-sample estimates of contamination, such as those generated by VerifyBamID. The format should be: sample p(read=R|genotype=AR) p(read=A|genotype=AA) Sample '*' can be used to set default contamination estimates.",
         ),
         ToolInput(
             tag="repoprtMaxGLFlag",
             prefix="--report-genotype-likelihood-max",
             input_type=Boolean(optional=True),
             doc=
             " --report-genotype-likelihood-max Report genotypes using the maximum-likelihood estimate provided from genotype likelihoods.",
         ),
         ToolInput(
             tag="genotypingMaxIter",
             prefix="-B",
             input_type=Int(),
             default=1000,
             doc=
             " -B --genotyping-max-iterations N Iterate no more than N times during genotyping step. default: 1000.",
         ),
         ToolInput(
             tag="genotypingMaxBDepth",
             prefix="--genotyping-max-banddepth",
             input_type=Int(),
             default=6,
             doc=
             " --genotyping-max-banddepth N Integrate no deeper than the Nth best genotype by likelihood when genotyping. default: 6.",
         ),
         ToolInput(
             tag="postIntegrationLim",
             prefix="-W",
             input_type=String(),
             default="1,3",
             doc=
             " -W --posterior-integration-limits N,M Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood. default: 1,3.",
         ),
         ToolInput(
             tag="excludeUnObsGT",
             prefix="-N",
             input_type=Boolean(optional=True),
             doc=
             " -N --exclude-unobserved-genotypes Skip sample genotypings for which the sample has no supporting reads.",
         ),
         ToolInput(
             tag="gtVarThres",
             prefix="-S",
             input_type=Int(optional=True),
             doc=
             " -S --genotype-variant-threshold N Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample. default: ~unbounded",
         ),
         ToolInput(
             tag="useMQFlag",
             prefix="-j",
             input_type=Boolean(optional=True),
             doc=
             " -j --use-mapping-quality Use mapping quality of alleles when calculating data likelihoods.",
         ),
         ToolInput(
             tag="harmIndelQualFlag",
             prefix="-H",
             input_type=Boolean(optional=True),
             doc=
             " -H --harmonic-indel-quality Use a weighted sum of base qualities around an indel, scaled by the distance from the indel. By default use a minimum BQ in flanking sequence.",
         ),
         ToolInput(
             tag="readDepFact",
             prefix="-D",
             input_type=Float(),
             default=0.9,
             doc=
             " -D --read-dependence-factor N Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations. default: 0.9",
         ),
         ToolInput(
             tag="gtQuals",
             prefix="-=",
             input_type=Boolean(optional=True),
             doc=
             " -= --genotype-qualities Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output.",
         ),
     ]
Exemple #7
0
class MantaBase(IlluminaToolBase, ABC):
    """Version-independent description of the Manta structural-variant caller.

    The command is assembled from two groups of inputs plus fixed arguments,
    ordered through their ``position`` values:

    * position 0 -- the literal ``configManta.py``
    * position 1 -- ``config_inputs`` (options for ``configManta.py``)
    * position 2 -- ``;`` followed by ``<runDir>/runWorkflow.py``
    * position 3 -- ``running_inputs`` and ``-j`` (options for the workflow)

    Because ``base_command`` is ``None``, both executables are introduced via
    ``arguments``. Concrete versions must implement ``container``.
    """

    def tool(self):
        """Return the tool identifier, ``"manta"``."""
        return "manta"

    def base_command(self):
        """Return ``None``; the executables are supplied by ``arguments``."""
        return None

    def cpus(self, hints: Dict[str, Any]):
        """Return the core count resolved from *hints*, falling back to 4."""
        return get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE) or 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory value resolved from *hints*, falling back to 4."""
        return get_value_for_hints_and_ordered_resource_tuple(
            hints, MEM_TUPLE) or 4

    def inputs(self) -> List[ToolInput]:
        """Return the configuration inputs followed by the run-time inputs."""
        return [*self.config_inputs, *self.running_inputs]

    def outputs(self) -> List[ToolOutput]:
        """Return the files collected from the run directory.

        Every glob is the ``runDir`` input joined with a fixed relative path.
        ``somaticSVs`` is the only output declared optional.
        """

        def in_run_dir(relative_path: str):
            # A fresh selector per output, exactly as if written out inline.
            return InputSelector("runDir") + relative_path

        return [
            ToolOutput("python", File(), glob=in_run_dir("/runWorkflow.py")),
            ToolOutput(
                "pickle",
                File(),
                glob=in_run_dir("/runWorkflow.py.config.pickle"),
            ),
            ToolOutput(
                "candidateSV",
                VcfTabix(),
                glob=in_run_dir("/results/variants/candidateSV.vcf.gz"),
            ),
            ToolOutput(
                "candidateSmallIndels",
                VcfTabix(),
                glob=in_run_dir(
                    "/results/variants/candidateSmallIndels.vcf.gz"),
            ),
            ToolOutput(
                "diploidSV",
                VcfTabix(),
                glob=in_run_dir("/results/variants/diploidSV.vcf.gz"),
            ),
            ToolOutput(
                "alignmentStatsSummary",
                File(),
                glob=in_run_dir("/results/stats/alignmentStatsSummary.txt"),
            ),
            ToolOutput(
                "svCandidateGenerationStats",
                Tsv(),
                glob=in_run_dir(
                    "/results/stats/svCandidateGenerationStats.tsv"),
            ),
            ToolOutput(
                "svLocusGraphStats",
                Tsv(),
                glob=in_run_dir("/results/stats/svLocusGraphStats.tsv"),
            ),
            ToolOutput(
                "somaticSVs",
                VcfTabix(optional=True),
                glob=in_run_dir("/results/variants/somaticSV.vcf.gz"),
            ),
        ]

    def arguments(self) -> List[ToolArgument]:
        """Return the fixed command-line pieces that are not user inputs."""
        # The ";" ends the configManta.py invocation so that the generated
        # runWorkflow.py script can be started as a second command.
        run_workflow = (StringFormatter(";") + InputSelector("runDir") +
                        "/runWorkflow.py")

        return [
            ToolArgument("configManta.py", position=0, shell_quote=False),
            ToolArgument(run_workflow, position=2, shell_quote=False),
            ToolArgument(
                CpuSelector(None),
                prefix="-j",
                position=3,
                shell_quote=False,
                doc="(-j) number of jobs, must be an integer or 'unlimited' "
                "(default: Estimate total cores on this node for local mode, 128 for sge mode)",
            ),
        ]

    @abstractmethod
    def container(self):
        """Return the container image of a concrete Manta version.

        Raises:
            NotImplementedError: always, when reached on this base class
                (for example through ``super().container()``).
        """
        # NotImplementedError derives from Exception, so callers that caught
        # the previous bare Exception keep working.
        raise NotImplementedError("Manta version must override docker command")

    def friendly_name(self):
        """Return the human-readable tool name."""
        return "Manta"

    def bind_metadata(self):
        """Return the ``ToolMetadata`` (authorship, citation, description)."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2019, 2, 12),
            dateUpdated=date(2019, 2, 19),
            institution="Illumina",
            # NOTE(review): the leading space looks accidental -- confirm
            # nothing depends on it before trimming.
            doi=" doi:10.1093/bioinformatics/btv710",
            citation=
            "Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and "
            "cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710",
            keywords=["illumina", "manta", "variant caller"],
            documentationUrl="https://github.com/Illumina/manta",
            documentation="""
Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads.
It is optimized for analysis of germline variation in small sets of individuals and somatic
variation in tumor/normal sample pairs. Manta discovers, assembles and scores large-scale SVs,
medium-sized indels and large insertions within a single efficient workflow. The method is
designed for rapid analysis on standard compute hardware: NA12878 at 50x genomic coverage is
analyzed in less than 20 minutes on a 20 core server, and most WGS tumor/normal analyses
can be completed within 2 hours. Manta combines paired and split-read evidence during SV
discovery and scoring to improve accuracy, but does not require split-reads or successful
breakpoint assemblies to report a variant in cases where there is strong evidence otherwise.

It provides scoring models for germline variants in small sets of diploid samples and somatic
variants in matched tumor/normal sample pairs. There is experimental support for analysis of
unmatched tumor samples as well. Manta accepts input read mappings from BAM or CRAM files and
reports all SV and indel inferences in VCF 4.1 format. See the user guide for a full description
of capabilities and limitations.""".strip(),
        )

    # Options for configManta.py; all sit at position 1, directly after the
    # "configManta.py" argument.
    config_inputs = [
        ToolInput(
            "config",
            File(optional=True),
            prefix="--config",
            position=1,
            shell_quote=False,
            doc=
            "provide a configuration file to override defaults in global config file "
            "(/opt/conda/share/manta-1.2.1-0/bin/configManta.py.ini)",
        ),
        ToolInput(
            "bam",
            BamBai(),
            prefix="--bam",
            position=1,
            shell_quote=False,
            doc=
            "FILE Normal sample BAM or CRAM file. May be specified more than once, multiple inputs "
            "will be treated as each BAM file representing a different sample. [optional] (no default)",
        ),
        # "runDir" is also what outputs() and arguments() select on.
        ToolInput(
            "runDir",
            Filename(),
            prefix="--runDir",
            position=1,
            shell_quote=False,
            doc=
            "Run script and run output will be written to this directory [required] "
            "(default: MantaWorkflow)",
        ),
        ToolInput(
            "reference",
            FastaFai(),
            prefix="--referenceFasta",
            position=1,
            shell_quote=False,
            doc="samtools-indexed reference fasta file [required]",
        ),
        ToolInput(
            "tumorBam",
            BamBai(optional=True),
            prefix="--tumorBam",
            position=1,
            shell_quote=False,
            doc=
            "Tumor sample BAM or CRAM file. Only up to one tumor bam file accepted. [optional=null]",
        ),
        ToolInput(
            "exome",
            Boolean(optional=True),
            prefix="--exome",
            position=1,
            shell_quote=False,
            doc="Set options for WES input: turn off depth filters",
        ),
        ToolInput(
            "rna",
            Boolean(optional=True),
            prefix="--rna",
            position=1,
            shell_quote=False,
            doc=
            "Set options for RNA-Seq input. Must specify exactly one bam input file",
        ),
        ToolInput(
            "unstrandedRNA",
            Boolean(optional=True),
            prefix="--unstrandedRNA",
            position=1,
            shell_quote=False,
            doc=
            "Set if RNA-Seq input is unstranded: Allows splice-junctions on either strand",
        ),
        ToolInput(
            "outputContig",
            Boolean(optional=True),
            prefix="--outputContig",
            position=1,
            shell_quote=False,
            doc="Output assembled contig sequences in VCF file",
        ),
        ToolInput(
            "callRegions",
            BedTabix(optional=True),
            prefix="--callRegions",
            position=1,
            shell_quote=False,
            doc=
            "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
            "regions to call. No VCF output will be provided outside of these regions. The full "
            "genome will still be used to estimate statistics from the input (such as expected depth "
            "per chromosome). Only one BED file may be specified. (default: call the entire genome)",
        ),
    ]

    # Options for runWorkflow.py; all sit at position 3, after the
    # ";<runDir>/runWorkflow.py" argument.
    running_inputs = [
        ToolInput(
            "mode",
            String(optional=True),
            default="local",
            prefix="--mode",
            position=3,
            shell_quote=False,
            doc="(-m) select run mode (local|sge)",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--quiet",
            position=3,
            shell_quote=False,
            doc="Don't write any log output to stderr "
            "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
        ),
        ToolInput(
            "queue",
            String(optional=True),
            prefix="--queue",
            position=3,
            shell_quote=False,
            doc="(-q) specify scheduler queue name",
        ),
        ToolInput(
            "memgb",
            Int(optional=True),
            prefix="--memGb",
            position=3,
            shell_quote=False,
            doc=
            "(-g) gigabytes of memory available to run workflow -- only meaningful in local mode, "
            "must be an integer (default: Estimate the total memory for this node for local  mode, "
            "'unlimited' for sge mode)",
        ),
        # ToolInput("dryRun", Boolean(optional=True), prefix="--dryRun", position=3, shell_quote=False,
        #           doc="(-d) dryRun workflow code without actually running command - tasks"),
        ToolInput(
            "maxTaskRuntime",
            String(optional=True),
            prefix="--maxTaskRuntime",
            position=3,
            shell_quote=False,
            doc=
            "(format: hh:mm:ss) Specify scheduler max runtime per task, argument is "
            "provided to the 'h_rt' resource limit if using SGE (no default)",
        ),
    ]