Beispiel #1
0
 def inputs(self):
     """Return the inputs declared here, followed by ``self.additional_args``.

     The four inputs built in this method (unmapped BAM, aligned SAM/BAM
     array, optional reference and output filename) all use position 10.
     """
     unmapped_bam = ToolInput(
         "ubam",
         Bam(),
         position=10,
         prefix="--UNMAPPED_BAM",
         prefix_applies_to_all_elements=True,
         doc="Original SAM or BAM file of unmapped reads, which must be in queryname order.",
     )
     aligned_bams = ToolInput(
         "bam",
         Array(Sam()),
         position=10,
         prefix="--ALIGNED_BAM",
         prefix_applies_to_all_elements=True,
         doc="SAM or BAM file(s) with alignment data.",
     )
     reference = ToolInput(
         "reference",
         FastaWithDict(optional=True),
         position=10,
         prefix="--REFERENCE_SEQUENCE",
         doc="Reference sequence file.",
     )
     merged_name = ToolInput(
         "outputFilename",
         Filename(extension=".bam"),
         position=10,
         prefix="--OUTPUT",
         doc="Merged SAM or BAM file to write to.",
     )
     # The shared extra arguments always come after the tool-specific inputs.
     return [unmapped_bam, aligned_bams, reference, merged_name, *self.additional_args]
Beispiel #2
0
 def inputs(self):
     """Return ``self.additional_inputs`` followed by the four optional file inputs.

     The inputs added here are the BAM (``-ibam``), BED (``-iBed``), generic
     (``-i``) and genome (``-g``) files; each one is declared optional.
     """
     # Start from a fresh copy so the shared sequence is never mutated.
     tool_inputs = list(self.additional_inputs)
     tool_inputs.append(
         ToolInput(
             "inputBam",
             Bam(optional=True),
             prefix="-ibam",
             doc="Input bam file. Note: BAM _must_ be sorted by position. A 'samtools sort <BAM>' should suffice.",
         )
     )
     tool_inputs.append(
         ToolInput(
             "inputBed",
             File(optional=True),
             prefix="-iBed",
             doc="Input bed file. Must be grouped by chromosome. A simple 'sort -k 1,1 <BED> > <BED>.sorted' will suffice.",
         )
     )
     tool_inputs.append(
         ToolInput(
             "inputFile",
             File(optional=True),
             prefix="-i",
             doc="Input file, can be gff/vcf.",
         )
     )
     tool_inputs.append(
         ToolInput(
             "genome",
             File(optional=True),
             prefix="-g",
             doc="Genome file. The genome file should tab delimited and structured as follows: <chromName><TAB><chromSize>.",
         )
     )
     return tool_inputs
 def inputs(self):
     """Return the tool inputs: BAM array, reference, output names and tuning options.

     ``threads`` defaults to a ``CpuSelector()`` and ``tmpdir`` to ``"./TMP"``;
     neither of those two declares a position.
     """
     bams = ToolInput("bams", Array(Bam()), position=10)
     reference = ToolInput(
         "reference", FastaWithDict(), prefix="--reference", position=1
     )
     vcf_name = ToolInput(
         "outputFilename",
         Filename(suffix=".svs", extension=".vcf"),
         prefix="--output",
         position=2,
     )
     assembly_name = ToolInput(
         "assemblyFilename",
         Filename(suffix=".assembled", extension=".bam"),
         prefix="--assembly",
         position=3,
     )
     threads = ToolInput(
         "threads", Int(optional=True), prefix="--threads", default=CpuSelector()
     )
     blacklist = ToolInput(
         "blacklist", Bed(optional=True), prefix="--blacklist", position=4
     )
     working_dir = ToolInput(
         "tmpdir", String(optional=True), prefix="--workingdir", default="./TMP"
     )
     return [
         bams,
         reference,
         vcf_name,
         assembly_name,
         threads,
         blacklist,
         working_dir,
     ]
 def outputs(self):
     """Return the VCF and assembly BAM outputs.

     Each output is globbed through an ``InputSelector`` on the matching
     filename input (``outputFilename`` / ``assemblyFilename``).
     """
     variants = ToolOutput("out", Vcf(), glob=InputSelector("outputFilename"))
     assembly = ToolOutput(
         "assembly", Bam(), glob=InputSelector("assemblyFilename")
     )
     return [variants, assembly]
Beispiel #5
0
 def inputs(self):
     """Return inherited inputs, the sort-specific inputs, then the shared extras.

     The result starts with the parent's ``inputs()``, adds the BAM to sort,
     the derived output filename and the sort order, and ends with
     ``Gatk4SortSamBase.additional_args``.
     """
     tool_inputs = list(super(Gatk4SortSamBase, self).inputs())
     tool_inputs.extend(
         [
             ToolInput(
                 "bam",
                 Bam(),
                 position=10,
                 prefix="-I",
                 doc="The SAM/BAM/CRAM file to sort.",
             ),
             ToolInput(
                 "outputFilename",
                 # Output name is the input BAM name (extension removed) + ".sorted.bam".
                 Filename(
                     prefix=InputSelector("bam", remove_file_extension=True),
                     suffix=".sorted",
                     extension=".bam",
                 ),
                 position=10,
                 prefix="-O",
                 doc="The sorted SAM/BAM/CRAM output file.",
             ),
             ToolInput(
                 "sortOrder",
                 String(),
                 position=10,
                 prefix="-SO",
                 doc=(
                     "The --SORT_ORDER argument is an enumerated type (SortOrder), which can have one of "
                     "the following values: [unsorted, queryname, coordinate, duplicate, unknown]"
                 ),
             ),
         ]
     )
     tool_inputs.extend(Gatk4SortSamBase.additional_args)
     return tool_inputs
Beispiel #6
0
 def outputs(self) -> List[ToolOutput]:
     """Return the BAM output and the optional JSON file found under ``HygeaLogs/``."""
     bam_out = ToolOutput("out", Bam(), glob=WildcardSelector("*"))
     options_out = ToolOutput(
         "used_options",
         File(optional=True),
         glob=WildcardSelector("HygeaLogs/*.json"),
     )
     return [bam_out, options_out]
Beispiel #7
0
 def inputs(self):
     """Return inherited and shared inputs followed by the view-specific inputs.

     The view-specific inputs are the alignment file (position 10), an
     optional reference (``-T``), the output filename (``-o``) and an optional
     list of regions placed after the input file (position 11).
     """
     inherited = [
         *super(SamToolsViewBase, self).inputs(),
         *SamToolsViewBase.additional_inputs,
     ]

     alignment = ToolInput("sam", UnionType(Sam(), Bam(), Cram()), position=10)
     reference = ToolInput(
         "reference",
         FastaWithDict(optional=True),
         prefix="-T",
         position=6,
         doc=(
             "A FASTA format reference FILE, optionally compressed by bgzip and ideally indexed "
             "by samtools faidx. If an index is not present, one will be generated for you."
         ),
     )
     output_name = ToolInput(
         "outputFilename",
         # Output name is derived from the "sam" input with its extension removed.
         Filename(
             prefix=InputSelector("sam", remove_file_extension=True),
             extension=".bam",
         ),
         prefix="-o",
         position=5,
         doc="Output to FILE [stdout].",
     )
     regions = ToolInput(
         "regions",
         Array(String, optional=True),
         position=11,
         doc=(
             "Region specifications after the input filename to restrict output to only those alignments which "
             "overlap the specified region(s). Use of region specifications requires a coordinate-sorted and "
             "indexed input file (in BAM or CRAM format)"
         ),
     )
     return inherited + [alignment, reference, output_name, regions]
Beispiel #8
0
 def tests(self):
     """Return the "basic" and "minimal" test cases, both fed from the remote test data.

     Both cases use the same input values; "basic" checks the ``out`` BAM via
     ``Bam.basic_test`` while "minimal" uses ``self.minimal_test()``.
     """
     remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"
     sam_url = f"{remote_dir}/NA12878-BRCA1.bwamem.stdout"
     reference_url = f"{remote_dir}/Homo_sapiens_assembly38.chr17.fasta"
     flagstat_url = f"{remote_dir}/NA12878-BRCA1.bam.flagstat"

     def fresh_input():
         # A new dict per test case, so the cases never share a mutable object.
         return {"sam": sam_url, "reference": reference_url, "threads": 16}

     basic_case = TTestCase(
         name="basic",
         input=fresh_input(),
         output=Bam.basic_test(
             "out",
             2740774,
             flagstat_url,
             "9a6af420f287df52a122ac723f41b535",
         ),
     )
     minimal_case = TTestCase(
         name="minimal",
         input=fresh_input(),
         output=self.minimal_test(),
     )
     return [basic_case, minimal_case]
Beispiel #9
0
 def outputs(self):
     """Return the stdout-captured BAM and the ``metrics.txt`` file output.

     The stdout name is taken from the ``outputFilename`` input.
     """
     stdout_bam = Stdout(Bam(), stdoutname=InputSelector("outputFilename"))
     bam_out = ToolOutput("out", stdout_bam)
     metrics_out = ToolOutput(
         "metrics", File(), glob=WildcardSelector("metrics.txt")
     )
     return [bam_out, metrics_out]
Beispiel #10
0
 def outputs(self):
     """Return the stdout-captured BAM and the first file matching ``metrics.txt``."""
     bam_out = ToolOutput("out", Stdout(Bam()))
     metrics_glob = WildcardSelector("metrics.txt", select_first=True)
     metrics_out = ToolOutput("metrics", File(), glob=metrics_glob)
     return [bam_out, metrics_out]
Beispiel #11
0
 def inputs(self):
     """Return the BAM, reference and output filename inputs, then the shared extras.

     The entries of ``ScrambleBase.additional_inputs`` are appended last.
     """
     own_inputs = [
         ToolInput("inputFilename", Bam(), position=200),
         ToolInput(
             "reference",
             FastaFai(),
             prefix="-r",
             doc="Reference sequence file.",
         ),
         ToolInput("outputFilename", Filename(extension=".bam")),
     ]
     return own_inputs + list(ScrambleBase.additional_inputs)
 def outputs(self) -> List[ToolOutput]:
     """Return the alignment, splice-junction and log outputs.

     Every glob is the ``outFileNamePrefix`` input followed by a fixed file
     name; the two BAM outputs are optional.
     """

     def prefixed(file_name):
         # Build a fresh selector for each output: "<outFileNamePrefix><file_name>".
         return InputSelector("outFileNamePrefix") + file_name

     unsorted_bam = ToolOutput(
         "out_unsorted_bam",
         Bam(optional=True),
         glob=prefixed("Aligned.out.bam"),
     )
     sorted_bam = ToolOutput(
         "out_sorted_bam",
         Bam(optional=True),
         glob=prefixed("Aligned.sortedByCoord.out.bam"),
     )
     splice_junctions = ToolOutput(
         "SJ_out_tab",
         File,
         glob=prefixed("SJ.out.tab"),
         doc="Each splicing is counted in the numbers of splices, which would correspond to summing the counts in SJ.out.tab.",
     )
     main_log = ToolOutput(
         "Log_out",
         File,
         glob=prefixed("Log.out"),
         doc="main log file with a lot of detailed information about the run. This file is most useful for troubleshooting and debugging.",
     )
     progress_log = ToolOutput(
         "Log_progress_out",
         File,
         glob=prefixed("Log.progress.out"),
         doc="reports job progress statistics, such as the number of processed reads, % of mapped reads etc.",
     )
     final_log = ToolOutput(
         "Log_final_out",
         File,
         glob=prefixed("Log.final.out"),
         doc="summary mapping statistics after mapping job is complete, very useful for quality control.",
     )
     return [
         unsorted_bam,
         sorted_bam,
         splice_junctions,
         main_log,
         progress_log,
         final_log,
     ]
Beispiel #13
0
 def tests(self):
     """Return the "basic" and "minimal" test cases built from local test data.

     All files are looked up in the ``wgsgermline_data`` folder under
     ``BioinformaticsTool.test_data_path()``. Both cases use the same input
     values; they differ only in how the output is checked.
     """

     def data_file(file_name):
         # Resolve the path on every call rather than caching the data directory.
         return os.path.join(
             BioinformaticsTool.test_data_path(), "wgsgermline_data", file_name
         )

     def fresh_input():
         # A new dict per test case, so the cases never share a mutable object.
         return {
             "sam": data_file("NA12878-BRCA1.bwamem.stdout"),
             "reference": data_file("Homo_sapiens_assembly38.chr17.fasta"),
             "threads": 16,
         }

     basic_case = TTestCase(
         name="basic",
         input=fresh_input(),
         output=Bam.basic_test(
             "out",
             2740774,
             data_file("NA12878-BRCA1.bam.flagstat"),
             "9a6af420f287df52a122ac723f41b535",
         ),
     )
     minimal_case = TTestCase(
         name="minimal",
         input=fresh_input(),
         output=self.minimal_test(),
     )
     return [basic_case, minimal_case]
Beispiel #14
0
 def inputs(self):
     """Return ``self.additional_inputs`` followed by the ``-a`` BAM and ``-b`` BED inputs."""
     bam_a = ToolInput(
         "inputABam",
         Bam(),
         prefix="-a",
         doc="input file a: only bam is supported at the moment",
     )
     beds_b = ToolInput(
         "inputBBed",
         Array(Bed()),
         prefix="-b",
         doc="input file b: only bed is supported at the moment. May be followed with multiple databases and/or  wildcard (*) character(s). ",
     )
     return list(self.additional_inputs) + [bam_a, beds_b]
Beispiel #15
0
 def inputs(self):
     """Return shared inputs, the ``-a`` BED / ``-b`` BAM files and the report flags.

     The four report flags (``-hist``, ``-d``, ``-counts``, ``-mean``) are all
     optional booleans and are generated from one table below.
     """
     # (tag, command-line prefix, documentation) for each optional boolean flag.
     flag_specs = [
         (
             "histogram",
             "-hist",
             "Report a histogram of coverage for each feature in A as well as a summary histogram for _all_ features in A. Output (tab delimited) after each feature in A: 1) depth 2) # bases at depth 3) size of A 4) % of A at depth.",
         ),
         (
             "depth",
             "-d",
             "Report the depth at each position in each A feature. Positions reported are one based.  Each position and depth follow the complete A feature.",
         ),
         (
             "counts",
             "-counts",
             "Only report the count of overlaps, don't compute fraction, etc.",
         ),
         (
             "mean",
             "-mean",
             "Report the mean depth of all positions in each A feature.",
         ),
     ]

     tool_inputs = list(self.additional_inputs)
     tool_inputs.append(
         ToolInput(
             "inputABed",
             Bed(),
             prefix="-a",
             doc="input file a: only bed is supported. May be followed with multiple databases and/or  wildcard (*) character(s). ",
         )
     )
     tool_inputs.append(
         ToolInput(
             "inputBBam",
             Bam(),
             prefix="-b",
             doc="input file b: only bam is supported.",
         )
     )
     tool_inputs.extend(
         ToolInput(tag, Boolean(optional=True), prefix=flag, doc=description)
         for tag, flag, description in flag_specs
     )
     return tool_inputs
Beispiel #16
0
 def inputs(self) -> List[ToolInput]:
     """Return the stitcher inputs followed by ``self.additional_stitcher_args``.

     ``inputBam`` and ``outputDir`` are both declared at position 4 with shell
     quoting disabled; ``piscesVersion`` and ``sampleName`` declare no prefix.
     """
     version = ToolInput("piscesVersion", String())
     bam_to_stitch = ToolInput(
         "inputBam",
         Bam(),
         position=4,
         prefix="--bam",
         shell_quote=False,
         doc="Input Bam to Stitch",
     )
     out_folder = ToolInput(
         "outputDir",
         String(),
         position=4,
         prefix="--outfolder",
         shell_quote=False,
         doc="Output file directory",
     )
     sample_name = ToolInput(
         "sampleName",
         String(),
         doc="Sample name for naming outputs",
     )
     return [
         version,
         bam_to_stitch,
         out_folder,
         sample_name,
         *self.additional_stitcher_args,
     ]
Beispiel #17
0
    def inputs(self) -> List[ToolInput]:
        return [
            ToolInput(
                "inputFile",
                CompressedVcf(),
                prefix="--input_file",
                doc="Input file name. Can use compressed file (gzipped).",
            ),
            ToolInput(
                "outputFilename",
                Filename(
                    prefix=InputSelector("inputFile", remove_file_extension=True),
                    extension=".vcf",
                ),
                prefix="--output_file",
                doc="(-o) Output file name. Results can write to STDOUT by specifying "
                ' as the output file name - this will force quiet mode. Default = "variant_effect_output.txt"',
            ),
            ToolInput(
                "vcf",
                Boolean(),
                default=True,
                prefix="--vcf",
                doc="Writes output in VCF format. Consequences are added in the INFO field of the VCF file, using the "
                'key "CSQ". Data fields are encoded separated by "|"; the order of fields is written in the VCF header.'
                ' Output fields in the "CSQ" INFO field can be selected by using --fields. If the input format was VCF,'
                " the file will remain unchanged save for the addition of the CSQ field (unless using any filtering). "
                "Custom data added with --custom are added as separate fields, using the key specified for each data "
                "file. Commas in fields are replaced with ampersands (&) to preserve VCF format.",
            ),
            # ToolInput('plugin', [PLUGINS](optional=True), prefix='--plugin',
            #           doc='Use named plugin. Plugin modules should be installed in the Plugins subdirectory of the VEP cache directory (defaults to $HOME/.vep/). Multiple plugins can be used by supplying the --plugin flag multiple times. See plugin documentation. Not used by default'),
            ToolInput(
                "help",
                Boolean(optional=True),
                prefix="--help",
                doc="Display help message and quit",
            ),
            ToolInput(
                "quiet",
                Boolean(optional=True),
                prefix="--quiet",
                doc="(-q) Suppress warning messages.Not used by default",
            ),
            ToolInput(
                "verbose",
                Boolean(optional=True),
                prefix="--verbose",
                doc="(-v) Print out a bit more information while running. Not used by default",
            ),
            ToolInput(
                "config",
                File(optional=True),
                prefix="--config",
                doc="""Load configuration options from a config file. The config file should consist of whitespace-separated pairs of option names and settings e.g.:

            output_file   my_output.txt
            species       mus_musculus
            format        vcf
            host          useastdb.ensembl.org

            A config file can also be implicitly read; save the file as $HOME/.vep/vep.ini (or equivalent directory if 
            using --dir). Any options in this file will be overridden by those specified in a config file using --config, 
            and in turn by any options specified on the command line. You can create a quick version file of this by 
            setting the flags as normal and running VEP in verbose (-v) mode. This will output lines that can be copied 
            to a config file that can be loaded in on the next run using --config. Not used by default""",
            ),
            ToolInput(
                "everything",
                Boolean(optional=True),
                prefix="--everything",
                doc="(-e) Shortcut flag to switch on all of the following: --sift b, --polyphen b, --ccds, "
                "--uniprot, --hgvs, --symbol, --numbers, --domains, --regulatory, --canonical, --protein, "
                "--biotype, --uniprot, --tsl, --appris, --gene_phenotype --af, --af_1kg, --af_esp, "
                "--af_gnomad, --max_af, --pubmed, --variant_class, --mane",
            ),
            ToolInput(
                "species",
                String(optional=True),
                prefix="--species",
                doc='Species for your data. This can be the latin name e.g. "homo_sapiens" or any Ensembl alias e.g. '
                '"mouse". Specifying the latin name can speed up initial database connection as the registry does '
                'not have to load all available database aliases on the server. Default = "homo_sapiens"',
            ),
            ToolInput(
                "assembly",
                String(optional=True),
                prefix="--assembly",
                doc="""(-a) Select the assembly version to use if more than one available. If using the cache, you must 
                have the appropriate assembly's cache file installed. If not specified and you have only 1 assembly 
                version installed, this will be chosen by default. Default = use found assembly version""",
            ),
            ToolInput(
                "inputData",
                String(optional=True),
                prefix="--input_data",
                doc="(--id) Raw input data as a string. May be used, for example, to input a single rsID or HGVS "
                "notation quickly to vep: --input_data rs699",
            ),
            ToolInput(
                "format",
                String(optional=True),
                prefix="--format",
                doc='Input file format - one of "ensembl", "vcf", "hgvs", "id", "region", "spdi". By default, '
                "VEP auto-detects the input file format. Using this option you can specify the input file is "
                "Ensembl, VCF, IDs, HGVS, SPDI or region format. Can use compressed version (gzipped) of any "
                "file format listed above. Auto-detects format by default",
            ),
            ToolInput(
                "forceOverwrite",
                Boolean(optional=True),
                prefix="--force_overwrite",
                doc="(--force) By default, VEP will fail with an error if the output file already exists. You can "
                "force the overwrite of the existing file by using this flag. Not used by default",
            ),
            ToolInput(
                "statsFile",
                String(optional=True),
                default="variant_effect_output.txt_summary.html",
                prefix="--stats_file",
                doc="(--sf) Summary stats file name. This is an HTML file containing a summary of the VEP run - the "
                'file name must end ".htm" or ".html". Default = "variant_effect_output.txt_summary.html"',
            ),
            ToolInput(
                "noStats",
                Boolean(optional=True),
                prefix="--no_stats",
                doc="""Don\'t generate a stats file. Provides marginal gains in run time.""",
            ),
            ToolInput(
                "statsText",
                Boolean(optional=True),
                prefix="--stats_text",
                doc="Generate a plain text stats file in place of the HTML.",
            ),
            ToolInput(
                "warningFile",
                Filename(suffix="warning", extension=".txt"),
                prefix="--warning_file",
                doc="File name to write warnings and errors to. Default = STDERR (standard error)",
            ),
            ToolInput(
                "maxSvSize",
                Boolean(optional=True),
                prefix="--max_sv_size",
                doc="Extend the maximum Structural Variant size VEP can process.",
            ),
            ToolInput(
                "noCheckVariantsOrder",
                Boolean(optional=True),
                prefix="--no_check_variants_order",
                doc="Permit the use of unsorted input files. However running VEP on unsorted input files slows down "
                "the tool and requires more memory.",
            ),
            ToolInput(
                "fork",
                Int(optional=True),
                default=CpuSelector(),
                prefix="--fork",
                doc="Enable forking, using the specified number of forks. Forking can dramatically improve runtime. "
                "Not used by default",
            ),
            ToolInput(
                "custom",
                Array(BedTabix, optional=True),
                prefix="--custom",
                prefix_applies_to_all_elements=True,
                doc="Add custom annotation to the output. Files must be tabix indexed or in the bigWig format. "
                "Multiple files can be specified by supplying the --custom flag multiple times. "
                "See https://asia.ensembl.org/info/docs/tools/vep/script/vep_custom.html for full details. "
                "Not used by default",
            ),
            ToolInput(
                "gff",
                File(optional=True),
                prefix="--gff",
                doc="Use GFF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "gtf",
                File(optional=True),
                prefix="--gtf",
                doc="Use GTF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "bam",
                Bam(optional=True),
                prefix="--bam",
                doc="ADVANCED Use BAM file of sequence alignments to correct transcript models not derived from "
                "reference genome sequence. Used to correct RefSeq transcript models. "
                "Enables --use_transcript_ref; add --use_given_ref to override this behaviour. Not used by default",
            ),
            ToolInput(
                "useTranscriptRef",
                Boolean(optional=True),
                prefix="--use_transcript_ref",
                doc="By default VEP uses the reference allele provided in the input file to calculate consequences "
                "for the provided alternate allele(s). Use this flag to force VEP to replace the provided "
                "reference allele with sequence derived from the overlapped transcript. This is especially "
                "relevant when using the RefSeq cache, see documentation for more details. The GIVEN_REF and "
                "USED_REF fields are set in the output to indicate any change. Not used by default",
            ),
            ToolInput(
                "useGivenRef",
                Boolean(optional=True),
                prefix="--use_given_ref",
                doc="Using --bam or a BAM-edited RefSeq cache by default enables --use_transcript_ref; add this flag "
                "to override this behaviour and use the provided reference allele from the input. Not used by default",
            ),
            ToolInput(
                "customMultiAllelic",
                Boolean(optional=True),
                prefix="--custom_multi_allelic",
                doc="By default, comma separated lists found within the INFO field of custom annotation VCFs are "
                "assumed to be allele specific. For example, a variant with allele_string A/G/C with associated "
                'custom annotation "single,double,triple" will associate triple with C, double with G and single '
                "with A. This flag instructs VEP to return all annotations for all alleles. Not used by default",
            ),
            ToolInput(
                "tab",
                Boolean(optional=True),
                prefix="--tab",
                doc="Writes output in tab-delimited format. Not used by default",
            ),
            ToolInput(
                "json",
                Boolean(optional=True),
                prefix="--json",
                doc="Writes output in JSON format. Not used by default",
            ),
            ToolInput(
                "compressOutput",
                String(optional=True),
                default="bgzip",
                prefix="--compress_output",
                doc="Writes output compressed using either gzip or bgzip. Not used by default",
            ),
            ToolInput(
                "fields",
                Array(String, optional=True),
                prefix="--fields",
                doc="""Configure the output format using a comma separated list of fields.
Can only be used with tab (--tab) or VCF format (--vcf) output.
For the tab format output, the selected fields may be those present in the default output columns, or 
any of those that appear in the Extra column (including those added by plugins or custom annotations). 
Output remains tab-delimited. For the VCF format output, the selected fields are those present within the ""CSQ"" INFO field.

Example of command for the tab output:

--tab --fields ""Uploaded_variation,Location,Allele,Gene""
Example of command for the VCF format output:

--vcf --fields ""Allele,Consequence,Feature_type,Feature""
Not used by default""",
            ),
            ToolInput(
                "minimal",
                Boolean(optional=True),
                prefix="--minimal",
                doc="Convert alleles to their most minimal representation before consequence calculation i.e. "
                "sequence that is identical between each pair of reference and alternate alleles is trimmed "
                "off from both ends, with coordinates adjusted accordingly. Note this may lead to discrepancies "
                "between input coordinates and coordinates reported by VEP relative to transcript sequences; "
                "to avoid issues, use --allele_number and/or ensure that your input variants have unique "
                "identifiers. The MINIMISED flag is set in the VEP output where relevant. Not used by default",
            ),
            ToolInput(
                "variantClass",
                Boolean(optional=True),
                prefix="--variant_class",
                doc="Output the Sequence Ontology variant class. Not used by default",
            ),
            ToolInput(
                "sift",
                String(optional=True),
                prefix="--sift",
                doc="Species limited SIFT predicts whether an amino acid substitution affects protein function based "
                "on sequence homology and the physical properties of amino acids. VEP can output the prediction "
                "term, score or both. Not used by default",
            ),
            ToolInput(
                "polyphen",
                String(optional=True),
                prefix="--polyphen",
                doc="Human only PolyPhen is a tool which predicts possible impact of an amino acid substitution on "
                "the structure and function of a human protein using straightforward physical and comparative "
                "considerations. VEP can output the prediction term, score or both. VEP uses the humVar score "
                "by default - use --humdiv to retrieve the humDiv score. Not used by default",
            ),
            ToolInput(
                "humdiv",
                Boolean(optional=True),
                prefix="--humdiv",
                doc="Human only Retrieve the humDiv PolyPhen prediction instead of the default humVar. "
                "Not used by default",
            ),
            ToolInput(
                "nearest",
                String(optional=True),
                prefix="--nearest",
                doc="""Retrieve the transcript or gene with the nearest protein-coding transcription start site 
                (TSS) to each input variant. Use ""transcript"" to retrieve the transcript stable ID, ""gene"" to 
                retrieve the gene stable ID, or ""symbol"" to retrieve the gene symbol. Note that the nearest 
                TSS may not belong to a transcript that overlaps the input variant, and more than one may be 
                reported in the case where two are equidistant from the input coordinates.

            Currently only available when using a cache annotation source, and requires the Set::IntervalTree perl module.
            Not used by default""",
            ),
            ToolInput(
                "distance",
                Array(Int, optional=True),
                separator=",",
                prefix="--distance",
                doc="Modify the distance up and/or downstream between a variant and a transcript for which VEP will assign the upstream_gene_variant or downstream_gene_variant consequences. Giving one distance will modify both up- and downstream distances; prodiving two separated by commas will set the up- (5') and down - (3') stream distances respectively. Default: 5000",
            ),
            ToolInput(
                "overlaps",
                Boolean(optional=True),
                prefix="--overlaps",
                doc="Report the proportion and length of a transcript overlapped by a structural variant in VCF format.",
            ),
            ToolInput(
                "genePhenotype",
                Boolean(optional=True),
                prefix="--gene_phenotype",
                doc="Indicates if the overlapped gene is associated with a phenotype, disease or trait. See list of phenotype sources. Not used by default",
            ),
            ToolInput(
                "regulatory",
                Boolean(optional=True),
                prefix="--regulatory",
                doc="Look for overlaps with regulatory regions. VEP can also report if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature. Not used by default",
            ),
            ToolInput(
                "cellType",
                Boolean(optional=True),
                prefix="--cell_type",
                doc="Report only regulatory regions that are found in the given cell type(s). Can be a single cell type or a comma-separated list. The functional type in each cell type is reported under CELL_TYPE in the output. To retrieve a list of cell types, use --cell_type list. Not used by default",
            ),
            ToolInput(
                "individual",
                Array(String, optional=True),
                prefix="--individual",
                separator=",",
                doc='Consider only alternate alleles present in the genotypes of the specified individual(s). May be a single individual, a comma-separated list or "all" to assess all individuals separately. Individual variant combinations homozygous for the given reference allele will not be reported. Each individual and variant combination is given on a separate line of output. Only works with VCF files containing individual genotype data; individual IDs are taken from column headers. Not used by default',
            ),
            ToolInput(
                "phased",
                Boolean(optional=True),
                prefix="--phased",
                doc="Force VCF genotypes to be interpreted as phased. For use with plugins that depend on phased data. Not used by default",
            ),
            ToolInput(
                "alleleNumber",
                Boolean(optional=True),
                prefix="--allele_number",
                doc="Identify allele number from VCF input, where 1 = first ALT allele, 2 = second ALT allele etc. Useful when using --minimal Not used by default",
            ),
            ToolInput(
                "showRefAllele",
                Boolean(optional=True),
                prefix="--show_ref_allele",
                doc='Adds the reference allele in the output. Mainly useful for the VEP "default" and tab-delimited output formats. Not used by default',
            ),
            ToolInput(
                "totalLength",
                Boolean(optional=True),
                prefix="--total_length",
                doc="Give cDNA, CDS and protein positions as Position/Length. Not used by default",
            ),
            ToolInput(
                "numbers",
                Boolean(optional=True),
                prefix="--numbers",
                doc="Adds affected exon and intron numbering to to output. Format is Number/Total. Not used by default",
            ),
            ToolInput(
                "noEscape",
                Boolean(optional=True),
                prefix="--no_escape",
                doc="Don't URI escape HGVS strings. Default = escape",
            ),
            ToolInput(
                "keepCsq",
                Boolean(optional=True),
                prefix="--keep_csq",
                doc="Don't overwrite existing CSQ entry in VCF INFO field. Overwrites by default",
            ),
            ToolInput(
                "vcfInfoField",
                String(optional=True),
                prefix="--vcf_info_field",
                doc='Change the name of the INFO key that VEP write the consequences to in its VCF output. Use "ANN" for compatibility with other tools such as snpEff. Default: CSQ',
            ),
            ToolInput(
                "terms",
                String(optional=True),
                prefix="--terms",
                doc='(-t) The type of consequence terms to output. The Ensembl terms are described here. The Sequence Ontology is a joint effort by genome annotation centres to standardise descriptions of biological sequences. Default = "SO"',
            ),
            ToolInput(
                "noHeaders",
                Boolean(optional=True),
                prefix="--no_headers",
                doc="Don't write header lines in output files. Default = add headers",
            ),
            ToolInput(
                "hgvs",
                Boolean(optional=True),
                prefix="--hgvs",
                doc="Add HGVS nomenclature based on Ensembl stable identifiers to the output. Both coding and protein sequence names are added where appropriate. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. HGVS notations given on Ensembl identifiers are versioned. Not used by default",
            ),
            ToolInput(
                "hgvsg",
                Boolean(optional=True),
                prefix="--hgvsg",
                doc="Add genomic HGVS nomenclature based on the input chromosome name. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. Not used by default",
            ),
            ToolInput(
                "shiftHgvs",
                Boolean(optional=True),
                prefix="--shift_hgvs",
                doc="""Enable or disable 3\' shifting of HGVS notations. When enabled, this causes ambiguous insertions or deletions (typically in repetetive sequence tracts) to be "shifted" to their most 3' possible coordinates (relative to the transcript sequence and strand) before the HGVS notations are calculated; the flag HGVS_OFFSET is set to the number of bases by which the variant has shifted, relative to the input genomic coordinates. Disabling retains the original input coordinates of the variant. Default: 1 (shift)""",
            ),
            ToolInput(
                "transcriptVersion",
                Boolean(optional=True),
                prefix="--transcript_version",
                doc="Add version numbers to Ensembl transcript identifiers",
            ),
            ToolInput(
                "protein",
                Boolean(optional=True),
                prefix="--protein",
                doc="Add the Ensembl protein identifier to the output where appropriate. Not used by default",
            ),
            ToolInput(
                "symbol",
                Boolean(optional=True),
                prefix="--symbol",
                doc="Adds the gene symbol (e.g. HGNC) (where available) to the output. Not used by default",
            ),
            ToolInput(
                "ccds",
                Boolean(optional=True),
                prefix="--ccds",
                doc="Adds the CCDS transcript identifer (where available) to the output. Not used by default",
            ),
            ToolInput(
                "uniprot",
                Boolean(optional=True),
                prefix="--uniprot",
                doc="Adds best match accessions for translated protein products from three UniProt-related databases (SWISSPROT, TREMBL and UniParc) to the output. Not used by default",
            ),
            ToolInput(
                "tsl",
                Boolean(optional=True),
                prefix="--tsl",
                doc="Adds the transcript support level for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "appris",
                Boolean(optional=True),
                prefix="--appris",
                doc="Adds the APPRIS isoform annotation for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "canonical",
                Boolean(optional=True),
                prefix="--canonical",
                doc="Adds a flag indicating if the transcript is the canonical transcript for the gene. Not used by default",
            ),
            ToolInput(
                "mane",
                Boolean(optional=True),
                prefix="--mane",
                doc="Adds a flag indicating if the transcript is the MANE Select transcript for the gene. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "biotype",
                Boolean(optional=True),
                prefix="--biotype",
                doc="Adds the biotype of the transcript or regulatory feature. Not used by default",
            ),
            ToolInput(
                "domains",
                Boolean(optional=True),
                prefix="--domains",
                doc="Adds names of overlapping protein domains to output. Not used by default",
            ),
            ToolInput(
                "xrefRefseq",
                Boolean(optional=True),
                prefix="--xref_refseq",
                doc="Output aligned RefSeq mRNA identifier for transcript. Not used by default. Note: The RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND FREQUENTLY WILL NOT, match exactly in sequence, exon structure and protein product",
            ),
            ToolInput(
                "synonyms",
                Tsv(optional=True),
                prefix="--synonyms",
                doc="Load a file of chromosome synonyms. File should be tab-delimited with the primary identifier in column 1 and the synonym in column 2. Synonyms allow different chromosome identifiers to be used in the input file and any annotation source (cache, database, GFF, custom file, FASTA file). Not used by default",
            ),
            ToolInput(
                "checkExisting",
                Boolean(optional=True),
                prefix="--check_existing",
                doc="""Checks for the existence of known variants that are co-located with your input. By default the alleles are compared and variants on an allele-specific basis - to compare only coordinates, use --no_check_alleles.

            Some databases may contain variants with unknown (null) alleles and these are included by default; to exclude them use --exclude_null_alleles.

            See this page for more details.

            Not used by default""",
            ),
            ToolInput(
                "checkSvs",
                Boolean(optional=True),
                prefix="--check_svs",
                doc="Checks for the existence of structural variants that overlap your input. Currently requires database access. Not used by default",
            ),
            ToolInput(
                "clinSigAllele",
                Boolean(optional=True),
                prefix="--clin_sig_allele",
                doc="Return allele specific clinical significance. Setting this option to 0 will provide all known clinical significance values at the given locus. Default: 1 (Provide allele-specific annotations)",
            ),
            ToolInput(
                "excludeNullAlleles",
                Boolean(optional=True),
                prefix="--exclude_null_alleles",
                doc="Do not include variants with unknown alleles when checking for co-located variants. Our human database contains variants from HGMD and COSMIC for which the alleles are not publically available; by default these are included when using --check_existing, use this flag to exclude them. Not used by default",
            ),
            ToolInput(
                "noCheckAlleles",
                Boolean(optional=True),
                prefix="--no_check_alleles",
                doc="""When checking for existing variants, by default VEP only reports a co-located variant if none of the input alleles are novel. For example, if your input variant has alleles A/G, and an existing co-located variant has alleles A/C, the co-located variant will not be reported.

            Strand is also taken into account - in the same example, if the input variant has alleles T/G but on the negative strand, then the co-located variant will be reported since its alleles match the reverse complement of input variant.

            Use this flag to disable this behaviour and compare using coordinates alone. Not used by default""",
            ),
            ToolInput(
                "af",
                Boolean(optional=True),
                prefix="--af",
                doc="Add the global allele frequency (AF) from 1000 Genomes Phase 3 data for any known co-located variant to the output. For this and all --af_* flags, the frequency reported is for the input allele only, not necessarily the non-reference or derived allele. Not used by default",
            ),
            ToolInput(
                "maxAf",
                Boolean(optional=True),
                prefix="--max_af",
                doc="Report the highest allele frequency observed in any population from 1000 genomes, ESP or gnomAD. Not used by default",
            ),
            ToolInput(
                "af1kg",
                String(optional=True),
                prefix="--af_1kg",
                doc="Add allele frequency from continental populations (AFR,AMR,EAS,EUR,SAS) of 1000 Genomes Phase 3 to the output. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afEsp",
                Boolean(optional=True),
                prefix="--af_esp",
                doc="Include allele frequency from NHLBI-ESP populations. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afGnomad",
                Boolean(optional=True),
                prefix="--af_gnomad",
                doc="Include allele frequency from Genome Aggregation Database (gnomAD) exome populations. Note only data from the gnomAD exomes are included; to retrieve data from the additional genomes data set, see this guide. Must be used with --cache Not used by default",
            ),
            ToolInput(
                "afExac",
                Boolean(optional=True),
                prefix="--af_exac",
                doc="Include allele frequency from ExAC project populations. Must be used with --cache. Not used by default. Note: ExAC data has been superceded by gnomAD. This flag remains for those wishing to use older cache versions containing ExAC data.",
            ),
            ToolInput(
                "pubmed",
                Boolean(optional=True),
                prefix="--pubmed",
                doc="Report Pubmed IDs for publications that cite existing variant. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "failed",
                Boolean(optional=True),
                prefix="--failed",
                doc="When checking for co-located variants, by default VEP will exclude variants that have been flagged as failed. Set this flag to include such variants. Default: 0 (exclude)",
            ),
            ToolInput(
                "gencodeBasic",
                Boolean(optional=True),
                prefix="--gencode_basic",
                doc="Limit your analysis to transcripts belonging to the GENCODE basic set. This set has fragmented or problematic transcripts removed. Not used by default",
            ),
            ToolInput(
                "excludePredicted",
                Boolean(optional=True),
                prefix="--exclude_predicted",
                doc='When using the RefSeq or merged cache, exclude predicted transcripts (i.e. those with identifiers beginning with "XM_" or "XR_").',
            ),
            ToolInput(
                "transcriptFilter",
                Boolean(optional=True),
                prefix="--transcript_filter",
                doc='''ADVANCED Filter transcripts according to any arbitrary set of rules. Uses similar notation to filter_vep.

            You may filter on any key defined in the root of the transcript object; most commonly this will be ""stable_id"":

            --transcript_filter ""stable_id match N[MR]_""''',
            ),
            ToolInput(
                "checkRef",
                Boolean(optional=True),
                prefix="--check_ref",
                doc="Force VEP to check the supplied reference allele against the sequence stored in the Ensembl Core database or supplied FASTA file. Lines that do not match are skipped. Not used by default",
            ),
            ToolInput(
                "lookupRef",
                Boolean(optional=True),
                prefix="--lookup_ref",
                doc="Force overwrite the supplied reference allele with the sequence stored in the Ensembl Core database or supplied FASTA file. Not used by default",
            ),
            ToolInput(
                "dontSkip",
                Boolean(optional=True),
                prefix="--dont_skip",
                doc="Don't skip input variants that fail validation, e.g. those that fall on unrecognised sequences. Combining --check_ref with --dont_skip will add a CHECK_REF output field when the given reference does not match the underlying reference sequence.",
            ),
            ToolInput(
                "allowNonVariant",
                Boolean(optional=True),
                prefix="--allow_non_variant",
                doc="When using VCF format as input and output, by default VEP will skip non-variant lines of input (where the ALT allele is null). Enabling this option the lines will be printed in the VCF output with no consequence data added.",
            ),
            ToolInput(
                "chr",
                Array(String, optional=True),
                prefix="--chr",
                separator=",",
                doc='Select a subset of chromosomes to analyse from your file. Any data not on this chromosome in the input will be skipped. The list can be comma separated, with "-" characters representing an interval. For example, to include chromosomes 1, 2, 3, 10 and X you could use --chr 1-3,10,X Not used by default',
            ),
            ToolInput(
                "codingOnly",
                Boolean(optional=True),
                prefix="--coding_only",
                doc="Only return consequences that fall in the coding regions of transcripts. Not used by default",
            ),
            ToolInput(
                "noIntergenic",
                Boolean(optional=True),
                prefix="--no_intergenic",
                doc="Do not include intergenic consequences in the output. Not used by default",
            ),
            ToolInput(
                "pick",
                Boolean(optional=True),
                prefix="--pick",
                doc="Pick once line or block of consequence data per variant, including transcript-specific columns. Consequences are chosen according to the criteria described here, and the order the criteria are applied may be customised with --pick_order. This is the best method to use if you are interested only in one consequence per variant. Not used by default",
            ),
            ToolInput(
                "pickAllele",
                Boolean(optional=True),
                prefix="--pick_allele",
                doc="Like --pick, but chooses one line or block of consequence data per variant allele. Will only differ in behaviour from --pick when the input variant has multiple alternate alleles. Not used by default",
            ),
            ToolInput(
                "perGene",
                Boolean(optional=True),
                prefix="--per_gene",
                doc="Output only the most severe consequence per gene. The transcript selected is arbitrary if more than one has the same predicted consequence. Uses the same ranking system as --pick. Not used by default",
            ),
            ToolInput(
                "pickAlleleGene",
                Boolean(optional=True),
                prefix="--pick_allele_gene",
                doc="Like --pick_allele, but chooses one line or block of consequence data per variant allele and gene combination. Not used by default",
            ),
            ToolInput(
                "flagPick",
                Boolean(optional=True),
                prefix="--flag_pick",
                doc="As per --pick, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAllele",
                Boolean(optional=True),
                prefix="--flag_pick_allele",
                doc="As per --pick_allele, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAlleleGene",
                Boolean(optional=True),
                prefix="--flag_pick_allele_gene",
                doc="As per --pick_allele_gene, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "pickOrder",
                Array(String, optional=True),
                prefix="--pick_order",
                separator=",",
                doc="""Customise the order of criteria (and the list of criteria) applied when choosing a block of annotation data with one of the following options: --pick, --pick_allele, --per_gene, --pick_allele_gene, --flag_pick, --flag_pick_allele, --flag_pick_allele_gene. See this page for the default order.
            Valid criteria are: [ canonical appris tsl biotype ccds rank length mane ]. e.g.:

            --pick --pick_order tsl,appris,rank""",
            ),
            ToolInput(
                "mostSevere",
                Boolean(optional=True),
                prefix="--most_severe",
                doc="Output only the most severe consequence per variant. Transcript-specific columns will be left blank. Consequence ranks are given in this table. To include regulatory consequences, use the --regulatory option in combination with this flag. Not used by default",
            ),
            ToolInput(
                "summary",
                Boolean(optional=True),
                prefix="--summary",
                doc="Output only a comma-separated list of all observed consequences per variant. Transcript-specific columns will be left blank. Not used by default",
            ),
            ToolInput(
                "filterCommon",
                Boolean(optional=True),
                prefix="--filter_common",
                doc="Shortcut flag for the filters below - this will exclude variants that have a co-located existing variant with global AF > 0.01 (1%). May be modified using any of the following freq_* filters. Not used by default",
            ),
            ToolInput(
                "checkFrequency",
                Boolean(optional=True),
                prefix="--check_frequency",
                doc="Turns on frequency filtering. Use this to include or exclude variants based on the frequency of co-located existing variants in the Ensembl Variation database. You must also specify all of the --freq_* flags below. Frequencies used in filtering are added to the output under the FREQS key in the Extra field. Not used by default",
            ),
            ToolInput(
                "freqPop",
                String(optional=True),
                prefix="--freq_pop",
                doc="Name of the population to use in frequency filter. This must be one of the following: (1KG_ALL, 1KG_AFR, 1KG_AMR, 1KG_EAS, 1KG_EUR, 1KG_SAS, AA, EA, gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS)",
            ),
            ToolInput(
                "freqFreq",
                Float(optional=True),
                prefix="--freq_freq",
                doc="Allele frequency to use for filtering. Must be a float value between 0 and 1",
            ),
            ToolInput(
                "freqGtLt",
                String(optional=True),
                prefix="--freq_gt_lt",
                doc="Specify whether the frequency of the co-located variant must be greater than (gt) or less than (lt) the value specified with --freq_freq",
            ),
            ToolInput(
                "freqFilter",
                String(optional=True),
                prefix="--freq_filter",
                doc="Specify whether to exclude or include only variants that pass the frequency filter",
            ),
            # CADD plugin
            ToolInput("caddReference", Array(VcfTabix, optional=True)),
            # Condel
            ToolInput(
                "condelConfig",
                Directory(optional=True),
                doc="Directory containing CondelPlugin config, in format: '<dir>/condel_SP.conf'",
            ),
            # dbNSFP
            ToolInput("dbnspReference", VcfTabix(optional=True), doc=""),
            ToolInput("dbsnpColumns", Array(String, optional=True)),
            # REVEL
            ToolInput("revelReference", VcfTabix(optional=True)),
            # CUSTOM
            ToolInput("custom1Reference", VcfTabix(optional=True)),
            ToolInput("custom1Columns", Array(String, optional=True)),
            ToolInput("custom2Reference", VcfTabix(optional=True)),
            ToolInput("custom2Columns", Array(String, optional=True)),
        ]
Beispiel #18
0
 def outputs(self):
     """Return the tool's outputs: one Bam named ``out``, globbed from the ``outputFilename`` input."""
     bam_type = Bam()
     # The glob is not a literal pattern; it is a selector on the "outputFilename" input.
     filename_selector = InputSelector("outputFilename")
     merged_bam = ToolOutput("out", bam_type, glob=filename_selector)
     return [merged_bam]
Beispiel #19
0
 def inputs(self):
     """Return the inherited inputs followed by this tool's own command-line inputs.

     Each ``ToolInput`` binds one argument (``prefix``) to a typed input; every
     prefixed input here is emitted with the value separated from its prefix.

     ``ga4gh_client_secrets``, ``validation_stringency`` and ``verbosity`` are
     typed ``String`` because their help text documents non-boolean values
     (a file name, or one of a fixed set of keywords).
     """
     return [
         *super().inputs(),
         # --- required inputs ---
         ToolInput(
             tag="bam",
             input_type=Bam(),
             prefix="--INPUT",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-I) The BAM or SAM file to fix. Required."),
         ),
         ToolInput(
             tag="outputFilename",
             # Generated name: derived from the "bam" input, with ".sorted" suffix and ".bam" extension.
             input_type=Filename(prefix=InputSelector("bam"),
                                 suffix=".sorted",
                                 extension=".bam"),
             prefix="--OUTPUT",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-O) The fixed BAM or SAM output file. Required."),
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithIndexes(),
             prefix="--REFERENCE_SEQUENCE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-R) Reference sequence file. Required."),
         ),
         # --- optional inputs ---
         ToolInput(
             tag="arguments_file",
             input_type=Array(File, optional=True),
             prefix="--arguments_file",
             separate_value_from_prefix=True,
             # The prefix is repeated for every file in the array.
             prefix_applies_to_all_elements=True,
             doc=InputDocumentation(
                 doc=
                 "read one or more arguments files and add them to the command line This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         # ToolInput(
         #     tag="compression_level",
         #     input_type=Int(optional=True),
         #     prefix="--COMPRESSION_LEVEL",
         #     separate_value_from_prefix=True,
         #     doc=InputDocumentation(
         #         doc="Compression level for all compressed files created (e.g. BAM and VCF). Default value: 2."
         #     ),
         # ),
         ToolInput(
             tag="create_index",
             input_type=Boolean(optional=True),
             prefix="--CREATE_INDEX",
             separate_value_from_prefix=True,
             # This wrapper sets default=True, unlike the tool default of
             # "false" quoted in the help text below.
             default=True,
             doc=InputDocumentation(
                 doc=
                 "Whether to create a BAM index when writing a coordinate-sorted BAM file. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="create_md5_file",
             input_type=Boolean(optional=True),
             prefix="--CREATE_MD5_FILE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Whether to create an MD5 digest for any BAM or FASTQ files created. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="ga4gh_client_secrets",
             # Takes a value (the help text's default is a file name), so it
             # cannot be a Boolean flag.
             input_type=String(optional=True),
             prefix="--GA4GH_CLIENT_SECRETS",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Default value: client_secrets.json."),
         ),
         ToolInput(
             tag="help",
             input_type=Boolean(optional=True),
             prefix="--help",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-h) display the help message Default value: false. Possible values: {true, false}"
             ),
         ),
         ToolInput(
             tag="is_bisulfite_sequence",
             input_type=Boolean(optional=True),
             prefix="--IS_BISULFITE_SEQUENCE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Whether the file contains bisulfite sequence (used when calculating the NM tag).  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="max_records_in_ram",
             input_type=Int(optional=True),
             prefix="--MAX_RECORDS_IN_RAM",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When writing files that need to be sorted, this will specify the number of records stored in RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort the file, and increases the amount of RAM needed.  Default value: 500000. "
             ),
         ),
         ToolInput(
             tag="quiet",
             input_type=Boolean(optional=True),
             prefix="--QUIET",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="set_only_uq",
             input_type=Boolean(optional=True),
             prefix="--SET_ONLY_UQ",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Only set the UQ tag, ignore MD and NM. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="tmp_dir",
             # NOTE(review): typed as File although the help text describes
             # directories - confirm whether Directory or String is intended.
             input_type=File(optional=True),
             prefix="--TMP_DIR",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "One or more directories with space available to be used by this program for temporary storage of working files  This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         ToolInput(
             tag="use_jdk_deflater",
             input_type=Boolean(optional=True),
             prefix="--USE_JDK_DEFLATER",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-use_jdk_deflater)  Use the JDK Deflater instead of the Intel Deflater for writing compressed output  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="use_jdk_inflater",
             input_type=Boolean(optional=True),
             prefix="--USE_JDK_INFLATER",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-use_jdk_inflater)  Use the JDK Inflater instead of the Intel Inflater for reading compressed input  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="validation_stringency",
             # One of STRICT / LENIENT / SILENT per the help text, so the
             # value must be passed as a string rather than a Boolean flag.
             input_type=String(optional=True),
             prefix="--VALIDATION_STRINGENCY",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Validation stringency for all SAM files read by this program.  Setting stringency to SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: STRICT. Possible values: {STRICT, LENIENT, SILENT} "
             ),
         ),
         ToolInput(
             tag="verbosity",
             # One of ERROR / WARNING / INFO / DEBUG per the help text, so the
             # value must be passed as a string rather than a Boolean flag.
             input_type=String(optional=True),
             prefix="--VERBOSITY",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Control verbosity of logging. Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} "
             ),
         ),
         ToolInput(
             tag="version",
             input_type=Boolean(optional=True),
             prefix="--version",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "display the version number for this tool Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="showhidden",
             input_type=Boolean(optional=True),
             prefix="--showHidden",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-showHidden)  display hidden arguments  Default value: false. Possible values: {true, false} "
             ),
         ),
     ]
Beispiel #20
0
 def inputs(self):
     """Return the command-line inputs of this tool.

     The list starts with everything ``super().inputs()`` provides and then
     appends the inputs declared here. Every input declared here is created
     with ``separate_value_from_prefix=True`` and an ``InputDocumentation``
     wrapping its help text.

     Options whose documented values are not booleans
     (``--GA4GH_CLIENT_SECRETS``, ``--RGDT``, ``--VERBOSITY``) are typed as
     ``String`` so that a value can be supplied, in line with how
     ``--VALIDATION_STRINGENCY`` is declared below.
     NOTE(review): the flags look like Picard AddOrReplaceReadGroups --
     confirm the exact value types against the tool's documentation.
     """

     def arg(tag, input_type, prefix, doc):
         # Common shape shared by every input declared in this method.
         return ToolInput(
             tag=tag,
             input_type=input_type,
             prefix=prefix,
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc=doc),
         )

     return [
         *super().inputs(),
         # --- required inputs -------------------------------------------
         arg(
             "inp",
             Bam(),
             "--INPUT",
             "(-I) Input file (BAM or SAM or a GA4GH url). Required.",
         ),
         arg(
             "outputFilename",
             # Output name is derived from the "inp" input with its
             # extension removed, plus ".bam".
             Filename(
                 prefix=InputSelector("inp", remove_file_extension=True),
                 extension=".bam",
             ),
             "--OUTPUT",
             "(-O) Output file (BAM or SAM). Required.",
         ),
         arg("rglb", String(), "--RGLB", "(-LB) Read-Group library Required."),
         arg(
             "rgpl",
             String(),
             "--RGPL",
             "(-PL) Read-Group platform (e.g. ILLUMINA, SOLID) Required.",
         ),
         arg(
             "rgpu",
             String(),
             "--RGPU",
             "(-PU) Read-Group platform unit (eg. run barcode) Required.",
         ),
         arg(
             "rgsm",
             String(),
             "--RGSM",
             "(-SM) Read-Group sample name Required.",
         ),
         # --- optional inputs -------------------------------------------
         arg(
             "arguments_file",
             File(optional=True),
             "--arguments_file",
             "read one or more arguments files and add them to the command line This argument may be "
             "specified 0 or more times. Default value: null. ",
         ),
         # ToolInput(
         #     tag="compression_level",
         #     input_type=Int(optional=True),
         #     prefix="--COMPRESSION_LEVEL",
         #     separate_value_from_prefix=True,
         #     doc=InputDocumentation(
         #         doc="Compression level for all compressed files created (e.g. BAM and VCF). Default value: 2."
         #     ),
         # ),
         arg(
             "create_index",
             Boolean(optional=True),
             "--CREATE_INDEX",
             "Whether to create a BAM index when writing a coordinate-sorted BAM file. "
             "Default value: false. Possible values: {true, false} ",
         ),
         arg(
             "create_md5_file",
             Boolean(optional=True),
             "--CREATE_MD5_FILE",
             "Whether to create an MD5 digest for any BAM or FASTQ files created. "
             "Default value: false. Possible values: {true, false} ",
         ),
         arg(
             "ga4gh_client_secrets",
             # Documented default is a file name, so this carries a value
             # rather than acting as an on/off flag.
             String(optional=True),
             "--GA4GH_CLIENT_SECRETS",
             "Default value: client_secrets.json.",
         ),
         arg(
             "help",
             Boolean(optional=True),
             "--help",
             "(-h) display the help message Default value: false. Possible values: {true, false}",
         ),
         arg(
             "max_records_in_ram",
             Int(optional=True),
             "--MAX_RECORDS_IN_RAM",
             "When writing files that need to be sorted, this will specify the number of records "
             "stored in RAM before spilling to disk. Increasing this number reduces the number of file "
             "handles needed to sort the file, and increases the amount of RAM needed.  "
             "Default value: 500000. ",
         ),
         arg(
             "quiet",
             Boolean(optional=True),
             "--QUIET",
             "Whether to suppress job-summary info on System.err. "
             "Default value: false. Possible values: {true, false} ",
         ),
         arg(
             "reference_sequence",
             File(optional=True),
             "--REFERENCE_SEQUENCE",
             "(-R) Reference sequence file. Default value: null.",
         ),
         arg(
             "rgcn",
             String(optional=True),
             "--RGCN",
             "(-CN) Read-Group sequencing center name Default value: null.",
         ),
         arg(
             "rgds",
             String(optional=True),
             "--RGDS",
             "(-DS) Read-Group description Default value: null.",
         ),
         arg(
             "rgdt",
             # A run date is a value, not a flag.
             String(optional=True),
             "--RGDT",
             "(-DT) Read-Group run date Default value: null.",
         ),
         arg(
             "rgfo",
             String(optional=True),
             "--RGFO",
             "(-FO) Read-Group flow order Default value: null.",
         ),
         arg(
             "rgid",
             String(optional=True),
             "--RGID",
             "(-ID) Read-Group ID Default value: 1.",
         ),
         arg(
             "rgks",
             String(optional=True),
             "--RGKS",
             "(-KS) Read-Group key sequence Default value: null.",
         ),
         arg(
             "rgpg",
             String(optional=True),
             "--RGPG",
             "(-PG) Read-Group program group Default value: null.",
         ),
         arg(
             "rgpi",
             Int(optional=True),
             "--RGPI",
             "(-PI) Read-Group predicted insert size Default value: null.",
         ),
         arg(
             "rgpm",
             String(optional=True),
             "--RGPM",
             "(-PM) Read-Group platform model Default value: null.",
         ),
         arg(
             "sort_order",
             String(optional=True),
             "--SORT_ORDER",
             "(-SO) Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT. "
             "Default value: null. Possible values: {unsorted, queryname, coordinate, duplicate, unknown} ",
         ),
         arg(
             "tmp_dir",
             File(optional=True),
             "--TMP_DIR",
             "One or more directories with space available to be used by this program for temporary storage "
             "of working files  This argument may be specified 0 or more times. Default value: null. ",
         ),
         arg(
             "use_jdk_deflater",
             Boolean(optional=True),
             "--USE_JDK_DEFLATER",
             "(-use_jdk_deflater)  Use the JDK Deflater instead of the Intel Deflater for writing "
             "compressed output  Default value: false. Possible values: {true, false} ",
         ),
         arg(
             "use_jdk_inflater",
             Boolean(optional=True),
             "--USE_JDK_INFLATER",
             "(-use_jdk_inflater)  Use the JDK Inflater instead of the Intel Inflater for reading "
             "compressed input  Default value: false. Possible values: {true, false} ",
         ),
         arg(
             "validation_stringency",
             String(optional=True),
             "--VALIDATION_STRINGENCY",
             " Validation stringency for all SAM files read by this program.  Setting stringency to "
             "SILENT can improve performance when processing a BAM file in which variable-length data "
             "(read, qualities, tags) do not otherwise need to be decoded.  Default value: STRICT. "
             "Possible values: {STRICT, LENIENT, SILENT} ",
         ),
         arg(
             "verbosity",
             # Takes one of the listed level names, so it is a string value.
             String(optional=True),
             "--VERBOSITY",
             "Control verbosity of logging. Default value: INFO. "
             "Possible values: {ERROR, WARNING, INFO, DEBUG} ",
         ),
         arg(
             "version",
             Boolean(optional=True),
             "--version",
             "display the version number for this tool Default value: false. Possible values: {true, false}",
         ),
         arg(
             "showhidden",
             Boolean(optional=True),
             "--showHidden",
             "(-showHidden)  display hidden arguments  Default value: false. Possible values: {true, false}",
         ),
     ]
Beispiel #21
0
 def inputs(self):
     """Return this tool's inputs.

     The result holds the ``alignedReads`` BAM input (position 200) and the
     ``outputFilename`` input (a ``.bam`` filename), followed by every entry
     of ``BamSorMaDupBase.additional_inputs``.
     """
     aligned_reads = ToolInput("alignedReads", Bam(), position=200)
     output_name = ToolInput("outputFilename", Filename(extension=".bam"))

     declared = [aligned_reads, output_name]
     declared.extend(BamSorMaDupBase.additional_inputs)
     return declared
Beispiel #22
0
 def inputs(self):
     """Return the command-line inputs of this tool.

     All inputs except ``contigs`` and ``filters`` are created with
     ``separate_value_from_prefix=True``; those two are created without that
     keyword, and ``filters`` additionally sets ``separator=" "``.

     ``filters`` uses ``Array(String(), ...)`` (an instance, not the bare
     ``String`` class) so that it is declared the same way as ``contigs``.
     """

     def opt(tag, input_type, prefix, doc, **extra):
         # Common shape of the inputs whose value is separated from the
         # prefix; ``extra`` forwards per-input keywords such as ``default``.
         return ToolInput(
             tag=tag,
             input_type=input_type,
             prefix=prefix,
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc=doc),
             **extra,
         )

     return [
         opt(
             "aligned_inp",
             Bam(),
             "-x",
             "File in SAM/BAM/CRAM format with main alignments as generated by STAR (Aligned.out.sam). "
             "Arriba extracts candidate reads from this file. This is sometimes /dev/stdin",
         ),
         opt(
             "inp_chimeric",
             Bam(optional=True),
             "-c",
             "File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR (Chimeric.out.sam). "
             "This parameter is only required, if STAR was run with the parameter "
             "'--chimOutType SeparateSAMold'. When STAR was run with the parameter "
             "'--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c can be omitted. ",
         ),
         opt(
             "gtf_file",
             File(optional=True),
             "-g",
             "GTF file with gene annotation. The file may be gzip-compressed.",
         ),
         opt(
             "gtf_features",
             Csv(optional=True),
             "-G",
             "Comma-/space-separated list of names of GTF features. "
             "Default: gene_name=gene_name|gene_id gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS ",
         ),
         opt(
             "reference",
             Fasta(optional=True),
             "-a",
             "FastA file with genome sequence (assembly). The file may be gzip-compressed. An index with "
             "the file extension .fai must exist only if CRAM files are processed. ",
         ),
         opt(
             "blacklist",
             File(optional=True),
             "-b",
             "File containing blacklisted events (recurrent artifacts and transcripts observed in healthy tissue). ",
         ),
         opt(
             "known_fusions",
             Tsv(optional=True),
             "-k",
             "File containing known/recurrent fusions. Some cancer entities are often characterized by "
             "fusions between the same pair of genes. In order to boost sensitivity, a list of known "
             "fusions can be supplied using this parameter. The list must contain two columns with the "
             "names of the fused genes, separated by tabs. ",
         ),
         opt(
             "output_filename",
             Filename(extension=".tsv"),
             "-o",
             "Output file with fusions that have passed all filters.",
             default="fusions.tsv",
         ),
         opt(
             "discarded_output_filename",
             Filename(suffix=".discarded", extension=".tsv"),
             "-O",
             "Output file with fusions that were discarded due to filtering.",
             default="fusions.discarded.tsv",
         ),
         opt(
             "structural_variants_coordinates",
             Tsv(optional=True),
             "-d",
             "Tab-separated file with coordinates of structural variants found using whole-genome "
             "sequencing data. These coordinates serve to increase sensitivity towards weakly expressed "
             "fusions and to eliminate fusions with low evidence. ",
         ),
         opt(
             "max_genomic_breakpoint_distance",
             Int(optional=True),
             "-D",
             "When a file with genomic breakpoints obtained via whole-genome sequencing is supplied via "
             "the -d parameter, this parameter determines how far a genomic breakpoint may be away from a "
             "transcriptomic breakpoint to consider it as a related event. For events inside genes, the "
             "distance is added to the end of the gene; for intergenic events, the distance threshold is "
             "applied as is. Default: 100000 ",
         ),
         opt(
             "strandedness",
             String(optional=True),
             "-s",
             "Whether a strand-specific protocol was used for library preparation, and if so, the type of "
             "strandedness (auto/yes/no/reverse). When unstranded data is processed, the strand can "
             "sometimes be inferred from splice-patterns. But in unclear situations, stranded data helps"
             " resolve ambiguities. Default: auto ",
         ),
         # The two array inputs below are built directly because they do not
         # pass separate_value_from_prefix.
         ToolInput(
             tag="contigs",
             input_type=Array(String(), optional=True),
             prefix="-i",
             doc=InputDocumentation(
                 doc="Comma-/space-separated list of interesting contigs. Fusions between genes on other contigs "
                 "are ignored. Contigs can be specified with or without the prefix 'chr'. "
                 "Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y "
             ),
         ),
         ToolInput(
             tag="filters",
             # Element type is an instance, consistent with "contigs" above.
             input_type=Array(String(), optional=True),
             prefix="-f",
             separator=" ",
             doc=InputDocumentation(
                 doc="Comma-/space-separated list of filters to disable. By default all filters are enabled. "
                 "Valid values: homopolymer, same_gene, inconsistently_clipped, duplicates, low_entropy, "
                 "no_genomic_support, short_anchor, homologs, blacklist, pcr_fusions, isoforms, intronic, "
                 "uninteresting_contigs, read_through, genomic_support, mismatches, no_coverage, spliced, "
                 "mismappers, merge_adjacent, select_best, many_spliced, long_gap, min_support, "
                 "relative_support, end_to_end, known_fusions, non_coding_neighbors, intragenic_exonic, "
                 "hairpin, small_insert_size "
             ),
         ),
         opt(
             "max_e_value",
             Float(optional=True),
             "-E",
             "Arriba estimates the number of fusions with a given number of supporting reads which one "
             "would expect to see by random chance. If the expected number of fusions (e-value) is higher "
             "than this threshold, the fusion is discarded by the 'relative_support' filter. Note: "
             "Increasing this threshold can dramatically increase the number of false positives and may "
             "increase the runtime of resource-intensive steps. Fractional values are possible. "
             "Default: 0.300000 ",
         ),
         opt(
             "min_supporting_reads",
             Int(optional=True),
             "-S",
             "The 'min_support' filter discards all fusions with fewer than this many supporting reads "
             "(split reads and discordant mates combined). Default: 2 ",
         ),
         opt(
             "max_mismappers",
             Float(optional=True),
             "-m",
             "When more than this fraction of supporting reads turns out to be mismappers, the "
             "'mismappers' filter discards the fusion. Default: 0.800000 ",
         ),
         opt(
             "max_homolog_identity",
             Float(optional=True),
             "-L",
             "Genes with more than the given fraction of sequence identity are considered homologs and "
             "removed by the 'homologs' filter. Default: 0.300000 ",
         ),
         opt(
             "homopolymer_length",
             Int(optional=True),
             "-H",
             "The 'homopolymer' filter removes breakpoints adjacent to homopolymers of the given length "
             "or more. Default: 6 ",
         ),
         opt(
             "read_through_distance",
             Int(optional=True),
             "-R",
             "The 'read_through' filter removes read-through fusions where the breakpoints are "
             "less than the given distance away from each other. Default: 10000 ",
         ),
         opt(
             "min_anchor_length",
             Int(optional=True),
             "-A",
             "Alignment artifacts are often characterized by split reads coming from only one gene "
             "and no discordant mates. Moreover, the split reads only align to a short stretch in one "
             "of the genes. The 'short_anchor' filter removes these fusions. This parameter sets the "
             "threshold in bp for what the filter considers short. Default: 23 ",
         ),
         opt(
             "many_spliced_events",
             Int(optional=True),
             "-M",
             "The 'many_spliced' filter recovers fusions between genes that have at least this "
             "many spliced breakpoints. Default: 4 ",
         ),
         opt(
             "max_kmer_content",
             Float(optional=True),
             "-K",
             "The 'low_entropy' filter removes reads with repetitive 3-mers. If the 3-mers make up more "
             "than the given fraction of the sequence, then the read is discarded. Default: 0.600000 ",
         ),
         opt(
             "max_mismatch_pvalue",
             Float(optional=True),
             "-V",
             "The 'mismatches' filter uses a binomial model to calculate a p-value for observing a given "
             "number of mismatches in a read. If the number of mismatches is too high, the read is "
             "discarded. Default: 0.010000 ",
         ),
         opt(
             "fragment_length",
             Int(optional=True),
             "-F",
             "When paired-end data is given, the fragment length is estimated automatically and this "
             "parameter has no effect. But when single-end data is given, the mean fragment length "
             "should be specified to effectively filter fusions that arise from hairpin structures. "
             "Default: 200 ",
         ),
         opt(
             "max_reads",
             Int(optional=True),
             "-U",
             "Subsample fusions with more than the given number of supporting reads. This improves "
             "performance without compromising sensitivity, as long as the threshold is high. Counting "
             "of supporting reads beyond the threshold is inaccurate, obviously. Default: 300 ",
         ),
         opt(
             "quantile",
             Float(optional=True),
             "-Q",
             "Highly expressed genes are prone to produce artifacts during library preparation. Genes "
             "with an expression above the given quantile are eligible for filtering by the 'pcr_fusions' "
             "filter. Default: 0.998000 ",
         ),
         opt(
             "exonic_fraction",
             Float(optional=True),
             "-e",
             "The breakpoints of false-positive predictions of intragenic events are often both in exons. "
             "True predictions are more likely to have at least one breakpoint in an intron, because "
             "introns are larger. If the fraction of exonic sequence between two breakpoints is smaller "
             "than the given fraction, the 'intragenic_exonic' filter discards the event. Default: 0.200000",
         ),
         opt(
             "fusion_transcript",
             Boolean(optional=True),
             "-T",
             "When set, the column 'fusion_transcript' is populated with the sequence of the fused genes "
             "as assembled from the supporting reads. Specify the flag twice to also print the fusion "
             "transcripts to the file containing discarded fusions (-O). Default: off ",
         ),
         opt(
             "peptide_sequence",
             Boolean(optional=True),
             "-P",
             "When set, the column 'peptide_sequence' is populated with the sequence of the fused proteins "
             "as assembled from the supporting reads. Specify the flag twice to also print the peptide "
             "sequence to the file containing discarded fusions (-O). Default: off ",
         ),
         opt(
             "read_identifiers",
             Boolean(optional=True),
             "-I",
             "When set, the column 'read_identifiers' is populated with identifiers of the reads which "
             "support the fusion. The identifiers are separated by commas. Specify the flag twice to "
             "also print the read identifiers to the file containing discarded fusions (-O). Default: off ",
         ),
         # ToolInput(
         #   tag="help",
         #   input_type=Boolean(optional=True),
         #   prefix="-h",
         #   separate_value_from_prefix=True,
         #   doc=InputDocumentation(doc="Print help and exit."),
         # ),
     ]
Beispiel #23
0
 def inputs(self):
     """Return the parent tool's inputs followed by the options declared here.

     Every option is emitted as ``<prefix> <value>`` because
     ``separate_value_from_prefix`` is True throughout.

     Options whose documented values are strings or enumerations
     (``validation_stringency``, ``verbosity``, ``ga4gh_client_secrets``) are
     typed ``String``. Typing them ``Boolean`` would describe a value-less
     switch, which contradicts the "Possible values"/"Default value" text
     in their own doc strings.
     """
     # Local import keeps this method self-contained; String is the janis
     # string data type used for free-text / enumerated option values.
     from janis_core import String

     return [
         *super().inputs(),
         ToolInput(
             tag="inp",
             input_type=Bam(),
             prefix="--INPUT",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-I) Input file (SAM or BAM) to extract reads from. Required."
             ),
         ),
         ToolInput(
             tag="outputFilename",
             # Output name is derived from the input name with its extension removed.
             input_type=Filename(
                 prefix=InputSelector("inp", remove_file_extension=True),
                 extension=".bam",
             ),
             prefix="--OUTPUT",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-O) Output file (SAM or BAM) to write extracted reads to. Required."
             ),
         ),
         # Since gatk 4.1.4.0 the former `--REFERENCE` option (tag "reference")
         # has been replaced by a sequence dictionary.
         ToolInput(
             tag="sequence_dictionary",
             input_type=File(),
             prefix="--SEQUENCE_DICTIONARY",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 "A Sequence Dictionary for the OUTPUT file (can be read from one of the following file types (SAM, BAM, VCF, BCF, Interval List, Fasta, or Dict)"
             ),
         ),
         ToolInput(
             tag="allow_contig_length_discordance",
             input_type=Boolean(optional=True),
             prefix="--ALLOW_CONTIG_LENGTH_DISCORDANCE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-U)  If true, then permits mapping from a read contig to a new reference contig with the same name but a different length.  Highly dangerous, only use if you know what you are doing.  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="allow_incomplete_dict_concordance",
             input_type=Boolean(optional=True),
             prefix="--ALLOW_INCOMPLETE_DICT_CONCORDANCE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-S)  If true, then allows only a partial overlap of the original contigs with the new reference sequence contigs.  By default, this tool requires a corresponding contig in the new reference for each read contig  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="arguments_file",
             input_type=File(optional=True),
             prefix="--arguments_file",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="read one or more arguments files and add them to the command line This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         # Disabled input, kept for reference:
         # ToolInput(
         #     tag="compression_level",
         #     input_type=Int(optional=True),
         #     prefix="--COMPRESSION_LEVEL",
         #     separate_value_from_prefix=True,
         #     doc=InputDocumentation(
         #         doc="Compression level for all compressed files created (e.g. BAM and VCF). Default value: 2."
         #     ),
         # ),
         ToolInput(
             tag="create_index",
             input_type=Boolean(optional=True),
             # NOTE: this wrapper defaults to True, overriding the tool
             # default ("false") quoted in the doc string below.
             default=True,
             prefix="--CREATE_INDEX",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Whether to create a BAM index when writing a coordinate-sorted BAM file. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="create_md5_file",
             input_type=Boolean(optional=True),
             prefix="--CREATE_MD5_FILE",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Whether to create an MD5 digest for any BAM or FASTQ files created. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="ga4gh_client_secrets",
             # The documented default is a file name, so this takes a value.
             input_type=String(optional=True),
             prefix="--GA4GH_CLIENT_SECRETS",
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc="Default value: client_secrets.json."),
         ),
         ToolInput(
             tag="help",
             input_type=Boolean(optional=True),
             prefix="--help",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-h) display the help message Default value: false. Possible values: {true, false}"
             ),
         ),
         ToolInput(
             tag="max_records_in_ram",
             input_type=Int(optional=True),
             prefix="--MAX_RECORDS_IN_RAM",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="When writing files that need to be sorted, this will specify the number of records stored in RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort the file, and increases the amount of RAM needed.  Default value: 500000. "
             ),
         ),
         ToolInput(
             tag="quiet",
             input_type=Boolean(optional=True),
             prefix="--QUIET",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="tmp_dir",
             # NOTE(review): the doc describes directories; File is kept
             # here to avoid changing the accepted type without confirmation.
             input_type=File(optional=True),
             prefix="--TMP_DIR",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="One or more directories with space available to be used by this program for temporary storage of working files  This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         ToolInput(
             tag="use_jdk_deflater",
             input_type=Boolean(optional=True),
             prefix="--USE_JDK_DEFLATER",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-use_jdk_deflater)  Use the JDK Deflater instead of the Intel Deflater for writing compressed output  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="use_jdk_inflater",
             input_type=Boolean(optional=True),
             prefix="--USE_JDK_INFLATER",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-use_jdk_inflater)  Use the JDK Inflater instead of the Intel Inflater for reading compressed input  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="validation_stringency",
             # Enumerated value (STRICT / LENIENT / SILENT), not a switch.
             input_type=String(optional=True),
             prefix="--VALIDATION_STRINGENCY",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=" Validation stringency for all SAM files read by this program.  Setting stringency to SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: STRICT. Possible values: {STRICT, LENIENT, SILENT} "
             ),
         ),
         ToolInput(
             tag="verbosity",
             # Enumerated value (ERROR / WARNING / INFO / DEBUG), not a switch.
             input_type=String(optional=True),
             prefix="--VERBOSITY",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Control verbosity of logging. Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} "
             ),
         ),
         ToolInput(
             tag="version",
             input_type=Boolean(optional=True),
             prefix="--version",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="display the version number for this tool Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="showhidden",
             input_type=Boolean(optional=True),
             prefix="--showHidden",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-showHidden)  display hidden arguments  Default value: false. Possible values: {true, false} "
             ),
         ),
     ]
Beispiel #24
0
class MantaBase(IlluminaToolBase, ABC):
    """Abstract Janis tool definition for Illumina Manta.

    The generated command has no base command; it is assembled from
    positioned arguments and inputs:

    * position 0: ``configManta.py``
    * position 1: the configuration options in ``config_inputs``
    * position 2: ``;<runDir>/runWorkflow.py`` (runs the generated workflow)
    * position 3: the execution options in ``running_inputs`` plus ``-j``

    Concrete subclasses must implement ``container``.
    """

    def tool(self):
        """Return the tool identifier."""
        return "manta"

    def base_command(self):
        """Return None: the command is built entirely from ``arguments()``."""
        return None

    def cpus(self, hints: Dict[str, Any]):
        """Return the core count resolved from ``hints``, or 4 if none applies."""
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        return val if val else 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory resolved from ``hints``, or 4 if none applies.

        NOTE(review): the unit is presumably gigabytes -- confirm against the
        base class contract.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        return val if val else 4

    def inputs(self) -> List[ToolInput]:
        """Return the configuration inputs followed by the running inputs."""
        return [*self.config_inputs, *self.running_inputs]

    def outputs(self) -> List[ToolOutput]:
        """Return the files collected from beneath the ``runDir`` input."""
        return [
            ToolOutput("python",
                       File(),
                       glob=InputSelector("runDir") + "/runWorkflow.py"),
            ToolOutput(
                "pickle",
                File(),
                glob=InputSelector("runDir") + "/runWorkflow.py.config.pickle",
            ),
            ToolOutput(
                "candidateSV",
                VcfTabix(),
                glob=InputSelector("runDir") +
                "/results/variants/candidateSV.vcf.gz",
            ),
            ToolOutput(
                "candidateSmallIndels",
                VcfTabix(),
                glob=InputSelector("runDir") +
                "/results/variants/candidateSmallIndels.vcf.gz",
            ),
            ToolOutput(
                "diploidSV",
                VcfTabix(),
                glob=InputSelector("runDir") +
                "/results/variants/diploidSV.vcf.gz",
            ),
            ToolOutput(
                "alignmentStatsSummary",
                File(),
                glob=InputSelector("runDir") +
                "/results/stats/alignmentStatsSummary.txt",
            ),
            ToolOutput(
                "svCandidateGenerationStats",
                Tsv(),
                glob=InputSelector("runDir") +
                "/results/stats/svCandidateGenerationStats.tsv",
            ),
            ToolOutput(
                "svLocusGraphStats",
                Tsv(),
                glob=InputSelector("runDir") +
                "/results/stats/svLocusGraphStats.tsv",
            ),
        ]

    def arguments(self) -> List[ToolArgument]:
        """Return the fixed command fragments that frame the inputs."""
        return [
            # Step 1: configuration script; config_inputs follow at position 1.
            ToolArgument("configManta.py", position=0, shell_quote=False),
            # Step 2: ';' terminates the configure command, then the generated
            # workflow script is invoked. Left unquoted so the shell sees ';'.
            ToolArgument(
                StringFormatter(";") + InputSelector("runDir") +
                "/runWorkflow.py",
                position=2,
                shell_quote=False,
            ),
            # Job count for runWorkflow.py, taken from the CPU selector.
            ToolArgument(
                CpuSelector(None),
                position=3,
                shell_quote=False,
                prefix="-j",
                doc="(-j) number of jobs, must be an integer or 'unlimited' "
                "(default: Estimate total cores on this node for local mode, 128 for sge mode)",
            ),
        ]

    @abstractmethod
    def container(self):
        """Return the container image; must be overridden per Manta version.

        Raises:
            NotImplementedError: always, when reached via ``super()``.
        """
        # NotImplementedError is an Exception subclass, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError(
            "Manta version must override docker command")

    def friendly_name(self):
        """Return the human-readable tool name."""
        return "Manta"

    def bind_metadata(self):
        """Return the descriptive metadata (authorship, citation, docs)."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2019, 2, 12),
            dateUpdated=date(2019, 2, 19),
            institution="Illumina",
            doi="doi:10.1093/bioinformatics/btv710",
            citation=
            "Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and "
            "cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710",
            keywords=["illumina", "manta", "variant caller"],
            documentationUrl="https://github.com/Illumina/manta",
            documentation="""
Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. 
It is optimized for analysis of germline variation in small sets of individuals and somatic 
variation in tumor/normal sample pairs. Manta discovers, assembles and scores large-scale SVs, 
medium-sized indels and large insertions within a single efficient workflow. The method is 
designed for rapid analysis on standard compute hardware: NA12878 at 50x genomic coverage is 
analyzed in less than 20 minutes on a 20 core server, and most WGS tumor/normal analyses 
can be completed within 2 hours. Manta combines paired and split-read evidence during SV 
discovery and scoring to improve accuracy, but does not require split-reads or successful 
breakpoint assemblies to report a variant in cases where there is strong evidence otherwise. 

It provides scoring models for germline variants in small sets of diploid samples and somatic 
variants in matched tumor/normal sample pairs. There is experimental support for analysis of 
unmatched tumor samples as well. Manta accepts input read mappings from BAM or CRAM files and 
reports all SV and indel inferences in VCF 4.1 format. See the user guide for a full description 
of capabilities and limitations.""".strip(),
        )

    # Options for configManta.py; all share position 1 so they are emitted
    # between "configManta.py" (position 0) and the workflow call (position 2).
    config_inputs = [
        ToolInput(
            "config",
            File(optional=True),
            prefix="--config",
            position=1,
            shell_quote=False,
            doc=
            "provide a configuration file to override defaults in global config file "
            "(/opt/conda/share/manta-1.2.1-0/bin/configManta.py.ini)",
        ),
        ToolInput(
            "bam",
            BamBai(),
            prefix="--bam",
            position=1,
            shell_quote=False,
            doc=
            "FILE Normal sample BAM or CRAM file. May be specified more than once, multiple inputs "
            "will be treated as each BAM file representing a different sample. [optional] (no default)",
        ),
        ToolInput(
            "runDir",
            Filename(),
            prefix="--runDir",
            position=1,
            shell_quote=False,
            doc=
            "Run script and run output will be written to this directory [required] "
            "(default: MantaWorkflow)",
        ),
        ToolInput(
            "reference",
            FastaWithDict(),
            prefix="--referenceFasta",
            position=1,
            shell_quote=False,
            doc="samtools-indexed reference fasta file [required]",
        ),
        ToolInput(
            "tumorBam",
            BamBai(optional=True),
            prefix="--tumorBam",
            position=1,
            shell_quote=False,
            doc=
            "Tumor sample BAM or CRAM file. Only up to one tumor bam file accepted. [optional=null]",
        ),
        ToolInput(
            "exome",
            Boolean(optional=True),
            prefix="--exome",
            position=1,
            shell_quote=False,
            doc="Set options for WES input: turn off depth filters",
        ),
        # --rna, --unstrandedRNA and --outputContig are value-less switches,
        # so they are Boolean like --exome rather than file inputs.
        ToolInput(
            "rna",
            Boolean(optional=True),
            prefix="--rna",
            position=1,
            shell_quote=False,
            doc=
            "Set options for RNA-Seq input. Must specify exactly one bam input file",
        ),
        ToolInput(
            "unstrandedRNA",
            Boolean(optional=True),
            prefix="--unstrandedRNA",
            position=1,
            shell_quote=False,
            doc=
            "Set if RNA-Seq input is unstranded: Allows splice-junctions on either strand",
        ),
        ToolInput(
            "outputContig",
            Boolean(optional=True),
            prefix="--outputContig",
            position=1,
            shell_quote=False,
            doc="Output assembled contig sequences in VCF file",
        ),
        ToolInput(
            "callRegions",
            BedTabix(optional=True),
            prefix="--callRegions",
            position=1,
            shell_quote=False,
            doc=
            "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
            "regions to call. No VCF output will be provided outside of these regions. The full "
            "genome will still be used to estimate statistics from the input (such as expected depth "
            "per chromosome). Only one BED file may be specified. (default: call the entire genome)",
        ),
    ]

    # Options for runWorkflow.py; all share position 3 so they follow the
    # workflow script emitted at position 2.
    running_inputs = [
        ToolInput(
            "mode",
            String(optional=True),
            default="local",
            prefix="--mode",
            position=3,
            shell_quote=False,
            doc="(-m) select run mode (local|sge)",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--quiet",
            position=3,
            shell_quote=False,
            doc="Don't write any log output to stderr "
            "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
        ),
        ToolInput(
            "queue",
            String(optional=True),
            prefix="--queue",
            position=3,
            shell_quote=False,
            doc="(-q) specify scheduler queue name",
        ),
        ToolInput(
            "memgb",
            Int(optional=True),
            prefix="--memGb",
            position=3,
            shell_quote=False,
            doc=
            "(-g) gigabytes of memory available to run workflow -- only meaningful in local mode, "
            "must be an integer (default: Estimate the total memory for this node for local  mode, "
            "'unlimited' for sge mode)",
        ),
        # Disabled input, kept for reference:
        # ToolInput("dryRun", Boolean(optional=True), prefix="--dryRun", position=3, shell_quote=False,
        #           doc="(-d) dryRun workflow code without actually running command - tasks"),
        ToolInput(
            "maxTaskRuntime",
            String(optional=True),
            prefix="--maxTaskRuntime",
            position=3,
            shell_quote=False,
            doc=
            "(format: hh:mm:ss) Specify scheduler max runtime per task, argument is "
            "provided to the 'h_rt' resource limit if using SGE (no default)",
        ),
    ]
 def inputs(self) -> List[ToolInput]:
     """Declare the tool inputs, rendered in ``KEY=value`` form.

     Every input sets ``separate_value_from_prefix=False`` so the value is
     glued directly onto its ``KEY=`` prefix. The returned list keeps the
     declaration order: primary files first, then the remaining options.
     """
     # Primary files: call output, reference, input BAMs and assembly BAM.
     primary_files = [
         ToolInput(
             tag="outputFilename",
             input_type=Filename(extension=".vcf"),
             prefix="OUTPUT=",
             separate_value_from_prefix=False,
             doc="(O=) VCF structural variation calls. Required.",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithDict(),
             prefix="REFERENCE_SEQUENCE=",
             separate_value_from_prefix=False,
         ),
         ToolInput(
             tag="bams",
             input_type=Array(Bam()),
             prefix="INPUT=",
             separate_value_from_prefix=False,
             # Each array element gets its own INPUT= prefix.
             prefix_applies_to_all_elements=True,
             doc=(
                 "(I=File Coordinate-sorted input BAM file. Default value: null. "
                 "This option may be specified 0 or more times."
             ),
         ),
         ToolInput(
             tag="assemblyFilename",
             input_type=Filename(suffix=".assembled", extension=".bam"),
             prefix="ASSEMBLY=",
             separate_value_from_prefix=False,
             doc=(
                 "Breakend assemblies which have undergone split read identification Required."
             ),
         ),
     ]

     # Remaining options, in their original order.
     remaining_options = [
         ToolInput(
             tag="inputLabel",
             input_type=String(optional=True),
             prefix="INPUT_LABEL=",
             separate_value_from_prefix=False,
             doc=(
                 "Input label. Variant calling evidence breakdowns are reported for each label. Default "
                 "labels correspond to INPUT filenames. When specifying labels, labels must be provided for "
                 "all input files. Default value: null. This option may be specified 0 or more times."
             ),
         ),
         ToolInput(
             tag="inputMaxFragmentSize",
             input_type=Int(optional=True),
             prefix="INPUT_MAX_FRAGMENT_SIZE=",
             separate_value_from_prefix=False,
             doc=(
                 "Per input maximum concordant fragment size. Default value: null. "
                 "This option may be specified 0 or more times."
             ),
         ),
         ToolInput(
             tag="inputMinFragmentSize",
             input_type=Int(optional=True),
             prefix="INPUT_MIN_FRAGMENT_SIZE=",
             separate_value_from_prefix=False,
             doc=(
                 "Per input minimum concordant fragment size. Default value: null. "
                 "This option may be specified 0 or more times."
             ),
         ),
         ToolInput(
             tag="readPairConcordantPercent",
             input_type=Float(optional=True),
             prefix="READ_PAIR_CONCORDANT_PERCENT=",
             separate_value_from_prefix=False,
             doc=(
                 "Percent of read pairs considered concorant (0.0-1.0). If this is unset, the SAM proper "
                 "pair flag is used to determine whether a read is discordantly aligned. Explicit fragment "
                 "size specification overrides this setting. Default value: 0.995. "
                 "This option can be set to 'null' to clear the default value."
             ),
         ),
         ToolInput(
             tag="blacklist",
             input_type=Bed(),
             prefix="BLACKLIST=",
             separate_value_from_prefix=False,
             doc=(
                 "(BL=File) BED blacklist of regions to ignore. Assembly of regions such as high-coverage "
                 "centromeric repeats is slow, and if such regions are to be filtered in downstream "
                 "analysis anyway, blacklisting those region will improve runtime performance. "
                 "For human WGS, the ENCODE DAC blacklist is recommended. Default value: null."
             ),
         ),
         ToolInput(
             tag="configurationFile",
             input_type=File(optional=True),
             prefix="CONFIGURATION_FILE=",
             separate_value_from_prefix=False,
             doc=(
                 "(C=File) gridss configuration file containing overrides Default value: null."
             ),
         ),
         ToolInput(
             tag="workerThreads",
             input_type=Int(optional=True),
             prefix="WORKER_THREADS=",
             separate_value_from_prefix=False,
             doc=(
                 "(THREADS=Integer  Number of worker threads to spawn. Defaults to number of cores available. "
                 "Note that I/O threads are not included in this worker thread count so CPU usage can be "
                 "higher than the number of worker thread. Default value: 6. "
                 "This option can be set to 'null' to clear the default value."
             ),
         ),
         ToolInput(
             tag="workingDir",
             input_type=String(optional=True),
             prefix="WORKING_DIR=",
             default=".",
             separate_value_from_prefix=False,
             doc=(
                 "Directory to place intermediate results directories. Default location is the same "
                 "directory as the associated input or output file. Default value: null."
             ),
         ),
         ToolInput(
             tag="ignoreDuplicates",
             input_type=Boolean(optional=True),
             prefix="IGNORE_DUPLICATES=",
             separate_value_from_prefix=False,
             doc=(
                 "Ignore reads marked as duplicates. Default value: true. This option can be set to 'null' "
                 "to clear the default value. Possible values: {true, false}"
             ),
         ),
     ]

     return primary_files + remaining_options
Beispiel #26
0
 def tests(self):
     """Return two test cases, "basic" and "minimal", built on the same inputs.

     Both cases use the NA12878-BRCA1 paired FASTQ files and the chr17
     reference from the ``wgsgermline_data`` test-data folder. They differ
     only in the expected output: "basic" uses ``Bam.basic_test`` with a
     flagstat file, "minimal" uses ``self.minimal_test()``.
     """

     def data_file(filename):
         # Resolve a file inside the shared "wgsgermline_data" test folder.
         return os.path.join(
             BioinformaticsTool.test_data_path(), "wgsgermline_data", filename
         )

     def alignment_inputs():
         # A fresh dict per call, so the two test cases never share state.
         return {
             "reads": [
                 data_file("NA12878-BRCA1_R1.fastq.gz"),
                 data_file("NA12878-BRCA1_R2.fastq.gz"),
             ],
             "reference": data_file("Homo_sapiens_assembly38.chr17.fasta"),
             "markShorterSplits": True,
             "readGroupHeaderLine": "@RG\tID:NA12878-BRCA1\tSM:NA12878-BRCA1\tLB:NA12878-BRCA1\tPL:ILLUMINA",
             "threads": 16,
         }

     basic_case = TTestCase(
         name="basic",
         input=alignment_inputs(),
         output=Bam.basic_test(
             "out",
             8628527,
             data_file("NA12878-BRCA1.bwamem.flagstat"),
         ),
     )
     minimal_case = TTestCase(
         name="minimal",
         input=alignment_inputs(),
         output=self.minimal_test(),
     )
     return [basic_case, minimal_case]
Beispiel #27
0
 def outputs(self) -> List[ToolOutput]:
     """Declare the single BAM output "out", globbed via the ``outputFilename`` input."""
     bam_output = ToolOutput("out", Bam(), glob=InputSelector("outputFilename"))
     # Disabled output, kept for reference:
     # ToolOutput("skippedReads", File(optional=True), glob=InputSelector("skippedReadsOutputFilename"))
     return [bam_output]
Beispiel #28
0
 def outputs(self):
     """Declare the single output "out": a BAM wrapped in ``Stdout``."""
     stdout_bam = ToolOutput("out", Stdout(Bam()))
     return [stdout_bam]