Example #1
0
        class Advanced:
            """Options for sample naming and for configuring the report."""

            # --- Sample-name handling -------------------------------------
            dirs = BooleanField(
                label="--dirs",
                description="Prepend directory to sample names.",
                default=True,
            )

            dirs_depth = IntegerField(
                label="--dirs-depth",
                description=(
                    "Prepend a specified number of directories to sample names. Enter a "
                    "negative number (default) to take from start of path."
                ),
                default=-1,
            )

            fullnames = BooleanField(
                label="--fullnames",
                description="Disable the sample name cleaning (leave as full file name).",
                default=False,
            )

            # --- Report configuration -------------------------------------
            config = BooleanField(
                label="Use configuration file",
                description="Use Genialis configuration file for MultiQC report.",
                default=True,
            )

            # Optional free-text overrides; no default is declared.
            cl_config = StringField(
                label="--cl-config",
                description=(
                    "Enter text with command-line configuration options to override the "
                    "defaults (e.g. custom_logo_url: https://www.genialis.com)."
                ),
                required=False,
            )
Example #2
0
    class Input:
        """Input fields for BsConversionRate.

        Declares the aligned bisulfite reads, an optional unmethylated
        control sequence, a switch to skip the step, and the numeric/boolean
        options of the conversion-rate estimation.
        """

        mr = DataField(
            "alignment:bam:walt",
            label="Aligned reads from bisulfite sequencing",
            # Adjacent string literals are joined verbatim, so each wrapped
            # piece must carry its own trailing space.
            description="Bisulfite specific alignment such as WALT is required as .mr file type is used. Duplicates "
            "should be removed to reduce any bias introduced by incomplete conversion on PCR duplicate "
            "reads.",
        )
        skip = BooleanField(
            label="Skip Bisulfite conversion rate step",
            description="Bisulfite conversion rate step can be skipped.",
            default=False,
        )
        # Optional control sequence (required=False).
        sequence = DataField(
            "seq:nucleotide",
            label="Unmethylated control sequence",
            description="Separate unmethylated control sequence FASTA file is required to estimate bisulfite "
            "conversion rate.",
            required=False,
        )
        count_all = BooleanField(
            label="Count all cytosines including CpGs", default=True
        )
        read_length = IntegerField(label="Average read length", default=150)
        # Optional; no default is declared for the mismatch fraction.
        max_mismatch = FloatField(
            label="Maximum fraction of mismatches", required=False
        )
        a_rich = BooleanField(label="Reads are A-rich", default=False)
Example #3
0
    class Input:
        """Input fields for InsertSizeMetrics.

        Declares the alignment and genome inputs followed by the histogram,
        trimming, validation and sort-order options.
        """

        bam = DataField("alignment:bam", label="Alignment BAM file")
        genome = DataField("seq:nucleotide", label="Genome")

        minimum_fraction = FloatField(
            # No trailing whitespace in the label.
            label="Minimum fraction of reads in a category to be considered",
            description="When generating the histogram, discard any data "
            "categories (out of FR, TANDEM, RF) that have fewer than this "
            "fraction of overall reads (Range: 0 and 0.5).",
            default=0.05,
        )

        include_duplicates = BooleanField(
            label="Include reads marked as duplicates in the insert size histogram",
            default=False,
        )

        deviations = FloatField(
            label="Deviations limit",
            description="Generate mean, standard deviation and plots by trimming "
            "the data down to MEDIAN + DEVIATIONS*MEDIAN_ABSOLUTE_DEVIATION. "
            "This is done because insert size data typically includes enough "
            "anomalous values from chimeras and other artifacts to make the "
            "mean and standard deviation grossly misleading regarding the real "
            "distribution.",
            default=10.0,
        )

        # Restricted to the three listed choices; STRICT is the default.
        validation_stringency = StringField(
            label="Validation stringency",
            description="Validation stringency for all SAM files read by this "
            "program. Setting stringency to SILENT can improve "
            "performance when processing a BAM file in which "
            "variable-length data (read, qualities, tags) do not "
            "otherwise need to be decoded. Default is STRICT.",
            choices=[
                ("STRICT", "STRICT"),
                ("LENIENT", "LENIENT"),
                ("SILENT", "SILENT"),
            ],
            default="STRICT",
        )

        assume_sorted = BooleanField(
            label="Sorted BAM file",
            description="If True, the sort order in the header file will be ignored.",
            default=False,
        )
Example #4
0
    class Input:
        """Input fields for AlignmentSummary.

        Covers the BAM/genome inputs, optional adapter sequences and the
        options that tune how the summary is computed.
        """

        # --- Data inputs --------------------------------------------------
        bam = DataField("alignment:bam", label="Alignment BAM file")
        genome = DataField("seq:nucleotide", label="Genome")

        adapters = DataField(
            "seq:nucleotide",
            label="Adapter sequences",
            required=False,
        )

        # --- Options ------------------------------------------------------
        validation_stringency = StringField(
            label="Validation stringency",
            description=(
                "Validation stringency for all SAM files read by this "
                "program. Setting stringency to SILENT can improve "
                "performance when processing a BAM file in which "
                "variable-length data (read, qualities, tags) do not "
                "otherwise need to be decoded. Default is STRICT."
            ),
            default="STRICT",
            choices=[
                ("STRICT", "STRICT"),
                ("LENIENT", "LENIENT"),
                ("SILENT", "SILENT"),
            ],
        )

        insert_size = IntegerField(
            label="Maximum insert size",
            default=100000,
        )

        # The literal string "null" is the stored value for "Unspecified".
        pair_orientation = StringField(
            label="Pair orientation",
            choices=[
                ("null", "Unspecified"),
                ("FR", "FR"),
                ("RF", "RF"),
                ("TANDEM", "TANDEM"),
            ],
            default="null",
        )

        bisulfite = BooleanField(
            label="BAM file consists of bisulfite sequenced reads",
            default=False,
        )

        assume_sorted = BooleanField(
            label="Sorted BAM file",
            description=(
                "If true the sort order in the header file will be ignored."
            ),
            default=False,
        )
    class Input:
        """Input fields to process ClusterTimeCourse.

        Declares the time-series expressions, an optional gene subset with
        its species and ID database, and the clustering options (distance
        metric, linkage method, optimal ordering).
        """

        expressions = ListField(
            DataField("expression"),
            relation_type="series",
            label="Time series relation",
            description="Select time course to which the expressions belong to.",
        )
        genes = ListField(
            StringField(),
            label="Gene subset",
            required=False,
            description="Select at least two genes or leave this field empty.",
        )
        # Both gene_* fields carry the hidden expression "!genes".
        gene_species = StringField(
            label="Species",
            description="Species to which the selected genes belong to. "
            "This field is required if gene subset is set.",
            required=False,
            hidden="!genes",
            allow_custom_choice=True,
            choices=[
                ("Dictyostelium discoideum", "Dictyostelium discoideum"),
                # Spelled out in full: the previous "H**o sapiens" was a
                # garbled species name in both the value and the label.
                ("Homo sapiens", "Homo sapiens"),
                ("Macaca mulatta", "Macaca mulatta"),
                ("Mus musculus", "Mus musculus"),
                ("Rattus norvegicus", "Rattus norvegicus"),
            ],
        )
        gene_source = StringField(
            label="Gene ID database of selected genes",
            description="This field is required if gene subset is set.",
            required=False,
            hidden="!genes",
        )
        distance = StringField(
            label="Distance metric",
            choices=[
                ("spearman", "Spearman"),
                ("pearson", "Pearson"),
            ],
            default="spearman",
        )
        linkage = StringField(
            label="Linkage method",
            choices=[
                ("single", "single"),
                ("average", "average"),
                ("complete", "complete"),
            ],
            default="average",
        )
        ordering = BooleanField(
            label="Use optimal ordering",
            description="Results in a more intuitive tree structure, "
            "but may slow down the clustering on large datasets",
            default=False,
        )
Example #6
0
    class Input:
        """Input fields to process WgsPreprocess.

        Declares the paired-end reads, reference sequence, BWA index and
        known variation sites, plus a group of advanced options.
        """

        # --- Data inputs --------------------------------------------------
        reads = DataField("reads:fastq:paired", label="Input sample")
        ref_seq = DataField("seq:nucleotide", label="Reference sequence")
        bwa_index = DataField("index:bwa", label="BWA genome index")
        known_sites = ListField(
            DataField("variants:vcf"),
            label="Known sites of variation (VCF)",
        )

        # Toggle referenced by the `hidden` expression of `advanced_options`.
        advanced = BooleanField(
            label="Show advanced options",
            description="Inspect and modify parameters.",
            default=False,
        )

        class AdvancedOptions:
            """Advanced options (optical duplicate pixel distance)."""

            pixel_distance = IntegerField(
                label="--OPTICAL_DUPLICATE_PIXEL_DISTANCE",
                description=(
                    "Set the optical pixel distance, e.g. "
                    "distance between clusters. Modify this parameter to "
                    "ensure compatibility with older Illumina platforms."
                ),
                default=2500,
            )

        advanced_options = GroupField(
            AdvancedOptions,
            label="Advanced options",
            hidden="!advanced",
        )
Example #7
0
        class InsertSizeMetrics:
            """Parameters of the InsertSizeMetrics step.

            Covers the category cut-off, duplicate handling and the
            deviation limit used for trimming.
            """

            minimum_fraction = FloatField(
                label="Minimum fraction of reads in a category to be considered",
                description=(
                    "When generating the histogram, discard any data "
                    "categories (out of FR, TANDEM, RF) that have fewer than "
                    "this fraction of overall reads (Range: 0 and 0.5)."
                ),
                default=0.05,
            )

            include_duplicates = BooleanField(
                default=False,
                label="Include reads marked as duplicates in the insert size histogram",
            )

            deviations = FloatField(
                label="Deviations limit",
                description=(
                    "Generate mean, standard deviation and plots "
                    "by trimming the data down to MEDIAN + DEVIATIONS * "
                    "MEDIAN_ABSOLUTE_DEVIATION. This is done because insert "
                    "size data typically includes enough anomalous values "
                    "from chimeras and other artifacts to make the mean and "
                    "standard deviation grossly misleading regarding the real "
                    "distribution."
                ),
                default=10.0,
            )
Example #8
0
    class Input:
        """Input fields for SlamdunkAllPaired.

        Declares the paired-end reads, the reference FASTA, the BED file of
        regions of interest and the multimapper/read-length options.
        """

        reads = DataField("reads:fastq:paired", label="Reads")
        ref_seq = DataField("seq:nucleotide", label="FASTA file")
        regions = DataField(
            "bed", label="BED file with coordinates of regions of interest"
        )

        filter_multimappers = BooleanField(
            label="Filter multimappers",
            description="If true filter and reassign multimappers based on provided BED file with regions of interest",
            default=True,
        )

        max_alignments = IntegerField(
            label="Maximum number of multimapper alignments",
            # Adjacent literals are joined verbatim; the first piece ends
            # with a space so the text does not read "readswith".
            description="The maximum number of alignments that will be reported for a multi-mapping read (i.e. reads "
            "with multiple alignments of equal best scores)",
            default=1,
        )

        read_length = IntegerField(
            label="Maximum read length",
            description="Maximum length of reads in the input FASTQ file",
            default=150,
        )
Example #9
0
    class Input:
        """Input fields to process ImportSra.

        Declares the list of SRA accessions and a group of advanced
        download/dump options.
        """

        sra_accession = ListField(
            StringField(),
            label="SRA accession(s)",
        )
        # Toggle referenced by the `hidden` expression of `advanced`.
        show_advanced = BooleanField(
            label="Show advanced options",
            default=False,
        )

        class Advanced:
            """Advanced options for prefetching and dumping reads."""

            prefetch = BooleanField(
                label="Prefetch SRA file",
                default=True,
            )
            max_size_prefetch = StringField(
                label="Maximum file size to download in KB",
                description=(
                    "A unit prefix can be used instead of a value in KB (e.g. 1024M or 1G)."
                ),
                default="20G",
            )
            # Optional numeric filters; none of them declares a default.
            min_spot_id = IntegerField(
                label="Minimum spot ID",
                required=False,
            )
            max_spot_id = IntegerField(
                label="Maximum spot ID",
                required=False,
            )
            min_read_len = IntegerField(
                label="Minimum read length",
                required=False,
            )
            clip = BooleanField(
                label="Clip adapter sequences",
                default=False,
            )
            aligned = BooleanField(
                label="Dump only aligned sequences",
                default=False,
            )
            unaligned = BooleanField(
                label="Dump only unaligned sequences",
                default=False,
            )

        advanced = GroupField(
            Advanced,
            label="Advanced options",
            hidden="!show_advanced",
        )
Example #10
0
    class Input:
        """Input fields: a GEO series accession plus advanced options."""

        gse_accession = StringField(
            label="GEO accession",
            description="Enter a GEO series accession number.",
        )
        # Toggle referenced by the `hidden` expression of `advanced`.
        show_advanced = BooleanField(
            label="Show advanced options",
            default=False,
        )

        class Advanced:
            """Advanced options for SRA download and probe-ID mapping."""

            # --- SRA download / dump settings -----------------------------
            prefetch = BooleanField(label="Prefetch SRA file", default=True)
            max_size_prefetch = StringField(
                label="Maximum file size to download in KB",
                description=(
                    "A unit prefix can be used instead of a value in KB (e.g. 1024M or 1G)."
                ),
                default="20G",
            )
            min_spot_id = IntegerField(label="Minimum spot ID", required=False)
            max_spot_id = IntegerField(label="Maximum spot ID", required=False)
            min_read_len = IntegerField(
                label="Minimum read length",
                required=False,
            )
            clip = BooleanField(label="Clip adapter sequences", default=False)
            aligned = BooleanField(
                label="Dump only aligned sequences",
                default=False,
            )
            unaligned = BooleanField(
                label="Dump only unaligned sequences",
                default=False,
            )

            # --- Custom probe-ID mapping (all optional) -------------------
            mapping_file = FileField(
                label="File with probe ID mappings",
                description=(
                    "The file should be tab-separated and contain two columns with their column names. The "
                    "first column should contain Gene IDs and the second one should contain probe names. Supported file "
                    "extensions are .tab.*, .tsv.*, .txt.*"
                ),
                required=False,
            )
            source = StringField(
                label="Gene ID source",
                description=(
                    "Gene ID source used for probe mapping is required when using a custom file."
                ),
                required=False,
                allow_custom_choice=True,
                choices=[
                    ("AFFY", "AFFY"),
                    ("DICTYBASE", "DICTYBASE"),
                    ("ENSEMBL", "ENSEMBL"),
                    ("NCBI", "NCBI"),
                    ("UCSC", "UCSC"),
                ],
            )
            build = StringField(
                label="Genome build",
                description=(
                    "Genome build of mapping file is required when using a custom file."
                ),
                required=False,
            )

        advanced = GroupField(
            Advanced,
            label="Advanced options",
            hidden="!show_advanced",
        )
Example #11
0
    class Input:
        """Input fields for GatkHaplotypeCallerGvcf.

        Declares the BAM file and reference sequence, plus an options group
        holding an optional intervals file and the contamination fraction.
        """

        bam = DataField("alignment:bam", label="Analysis ready BAM file")
        ref_seq = DataField("seq:nucleotide", label="Reference sequence")

        # Toggle referenced by the `hidden` expression of `options`.
        advanced = BooleanField(
            label="Show advanced options",
            description="Inspect and modify parameters.",
            default=False,
        )

        class Options:
            """Optional intervals file and contamination fraction."""

            intervals = DataField(
                "bed",
                label=(
                    "Use intervals BED file to limit the analysis to the specified parts of the genome."
                ),
                required=False,
            )

            contamination = FloatField(
                label="Contamination fraction",
                description=(
                    "Fraction of contamination in sequencing data (for all samples) to aggressively remove."
                ),
                default=0,
            )

        options = GroupField(
            Options,
            label="Options",
            hidden="!advanced",
        )
Example #12
0
    class Input:
        """Input fields for SlamdunkAllPaired.

        Declares the paired-end reads, the FASTA with sequences to align
        against, the BED file of regions of interest and the
        multimapper/read-length options.
        """

        reads = DataField('reads:fastq:paired', label='Reads')
        transcriptome = DataField(
            'seq:nucleotide',
            label='FASTA file containing sequences for aligning.')
        regions = DataField(
            'bed', label='BED file with coordinates of regions of interest.')

        filter_multimappers = BooleanField(
            label='Filter multimappers',
            description=
            'If true filter and reassign multimappers based on provided BED file with regions of interest.',
            default=True)

        # Adjacent literals are joined verbatim; the first piece ends with a
        # space so the text does not read "readswith".
        max_alignments = IntegerField(
            label='Maximum number of multimapper alignments',
            description=
            'The maximum number of alignments that will be reported for a multi-mapping read (i.e. reads '
            'with multiple alignments of equal best scores).',
            default=1)

        read_length = IntegerField(
            label='Maximum read length',
            description='Maximum length of reads in the input FASTQ file.',
            default=150)
Example #13
0
        class Options:
            """Coverage-related thresholds, caps and validation settings."""

            # --- Quality thresholds ---------------------------------------
            min_map_quality = IntegerField(
                label="Minimum mapping quality for a read to contribute coverage",
                default=20,
            )

            min_quality = IntegerField(
                label="Minimum base quality for a base to contribute coverage",
                description=(
                    "N bases will be treated as having a base quality of "
                    "negative infinity and will therefore be excluded from coverage "
                    "regardless of the value of this parameter."
                ),
                default=20,
            )

            # --- Coverage caps --------------------------------------------
            coverage_cap = IntegerField(
                label="Maximum coverage cap",
                description=(
                    "Treat positions with coverage exceeding this value as "
                    "if they had coverage at this set value."
                ),
                default=250,
            )

            accumulation_cap = IntegerField(
                label="Ignore positions with coverage above this value",
                description=(
                    "At positions with coverage exceeding this value, "
                    "completely ignore reads that accumulate beyond this value"
                ),
                default=100000,
            )

            # --- Counting and sampling ------------------------------------
            count_unpaired = BooleanField(
                label="Count unpaired reads and paired reads with one end unmapped",
                default=False,
            )

            sample_size = IntegerField(
                label="Sample Size used for Theoretical Het Sensitivity sampling",
                default=10000,
            )

            # --- Validation -----------------------------------------------
            validation_stringency = StringField(
                label="Validation stringency",
                description=(
                    "Validation stringency for all SAM files read by this "
                    "program. Setting stringency to SILENT can improve "
                    "performance when processing a BAM file in which "
                    "variable-length data (read, qualities, tags) do not "
                    "otherwise need to be decoded. Default is STRICT."
                ),
                default="STRICT",
                choices=[
                    ("STRICT", "STRICT"),
                    ("LENIENT", "LENIENT"),
                    ("SILENT", "SILENT"),
                ],
            )
Example #14
0
        class AdvancedOptions:
            """Advanced options: import batch size and fragment consolidation."""

            batch_size = IntegerField(
                label="Batch size",
                description=(
                    "Batch size controls the number of samples "
                    "for which readers are open at once and therefore provides "
                    "a way to minimize memory consumption. However, it can "
                    "take longer to complete. Use the consolidate flag if more "
                    "than a hundred batches were used. This will improve feature "
                    "read time. batchSize=0 means no batching "
                    "(i.e. readers for all samples will be opened at once)."
                ),
                default=0,
            )

            consolidate = BooleanField(
                label="Consolidate",
                description=(
                    "Boolean flag to enable consolidation. If "
                    "importing data in batches, a new fragment is created for "
                    "each batch. In case thousands of fragments are created, "
                    "GenomicsDB feature readers will try to open ~20x as many "
                    "files. Also, internally GenomicsDB would consume more "
                    "memory to maintain bookkeeping data from all fragments. "
                    "Use this flag to merge all fragments into one. Merging "
                    "can potentially improve read performance, however overall "
                    "benefit might not be noticeable as the top Java layers "
                    "have significantly higher overheads. This flag has no "
                    "effect if only one batch is used."
                ),
                default=False,
            )
Example #15
0
        class Advanced:
            """Advanced options for prefetching and dumping reads."""

            prefetch = BooleanField(
                label="Prefetch SRA file",
                default=True,
            )
            max_size_prefetch = StringField(
                label="Maximum file size to download in KB",
                description=(
                    "A unit prefix can be used instead of a value in KB (e.g. 1024M or 1G)."
                ),
                default="20G",
            )
            # Optional numeric filters; none of them declares a default.
            min_spot_id = IntegerField(
                label="Minimum spot ID",
                required=False,
            )
            max_spot_id = IntegerField(
                label="Maximum spot ID",
                required=False,
            )
            min_read_len = IntegerField(
                label="Minimum read length",
                required=False,
            )
            clip = BooleanField(
                label="Clip adapter sequences",
                default=False,
            )
            aligned = BooleanField(
                label="Dump only aligned sequences",
                default=False,
            )
            unaligned = BooleanField(
                label="Dump only unaligned sequences",
                default=False,
            )
Example #16
0
        class Advanced:
            """Group of advanced options (a single boolean field)."""

            # The label and description texts state the wording convention:
            # no trailing period on labels, a trailing period on descriptions.
            boolean_field2 = BooleanField(
                default=False,
                label="Labels are short and do not end in a period",
                description="Description ends in a period.",
            )
Example #17
0
    class Input:
        """Input fields to process ROSE2.

        Declares two alternative region sources (MACS results or an uploaded
        BED file), the BAM files used for ranking, and the TSS exclusion,
        stitching and masking options.
        """

        # --- Region sources; their `hidden` expressions name each other ---
        input_macs = DataField(
            "chipseq:callpeak",
            label="BED/narrowPeak file (MACS results)",
            hidden="input_upload",
            required=False,
        )
        input_upload = DataField(
            "bed",
            label="BED file (Upload)",
            hidden="input_macs || use_filtered_bam",
            required=False,
        )

        # --- Ranking inputs -----------------------------------------------
        use_filtered_bam = BooleanField(
            label="Use Filtered BAM File",
            description="Use filtered BAM file from a MACS2 object to rank "
            "enhancers by. Only applicable if input is MACS2.",
            default=False,
            hidden="input_upload",
        )
        rankby = DataField(
            "alignment:bam",
            label="BAM file",
            description="BAM file to rank enhancers by.",
            hidden="use_filtered_bam",
            required=False,
        )
        # NOTE(review): this description is identical to the one on
        # `rankby`; confirm that is intended for the control file.
        control = DataField(
            "alignment:bam",
            label="Control BAM File",
            description="BAM file to rank enhancers by.",
            hidden="use_filtered_bam",
            required=False,
        )

        # --- Analysis options ---------------------------------------------
        tss = IntegerField(
            label="TSS exclusion",
            description=(
                "Enter a distance from TSS to exclude. 0 = no TSS exclusion."
            ),
            default=0,
        )
        stitch = IntegerField(
            label="Stitch",
            description="Enter a max linking distance for stitching. If not "
            "given, optimal stitching parameter will be determined "
            "automatically.",
            required=False,
        )
        mask = DataField(
            "bed",
            label="Masking BED file",
            description="Mask a set of regions from analysis. Provide a BED of "
            "masking regions.",
            required=False,
        )
Example #18
0
    class Input:
        """Input fields to process ChipQC.

        Declares the aligned reads and called peaks, an optional blacklist,
        the enrichment switch and a group of advanced parameters.
        """

        alignment = DataField(
            data_type="alignment:bam",
            label="Aligned reads",
        )
        peaks = DataField(
            data_type="chipseq:callpeak",
            label="Called peaks",
        )
        blacklist = DataField(
            data_type="bed",
            label="Blacklist regions",
            description="BED file containing genomic regions that should be "
            "excluded from the analysis.",
            required=False,
        )
        calculate_enrichment = BooleanField(
            label="Calculate enrichment",
            description="Calculate enrichment of signal in known genomic "
            "annotation. By default annotation is provided from "
            "the TranscriptDB package specified by genome build "
            "which should match one of the supported annotations "
            "(hg19, hg38, hg18, mm10, mm9, rn4, ce6, dm3). If "
            "annotation is not supported the analysis is skipped.",
            default=False,
        )

        class Advanced:
            """Add advanced list of options."""

            quality_threshold = IntegerField(
                label="Mapping quality threshold",
                description="Only reads with mapping quality scores above "
                "this threshold will be used for some statistics.",
                default=15,
            )
            profile_window = IntegerField(
                label="Window size",
                description="An integer indicating the width of the window "
                "used for peak profiles. Peaks will be centered "
                "on their summits and include half of the window "
                "size upstream and half downstream of this point.",
                default=400,
            )
            # Stored as a string in "start:end" form (default "1:300").
            shift_size = StringField(
                label="Shift size",
                description="Vector of values to try when computing optimal "
                "shift sizes. It should be specified as "
                "consecutive numbers vector with start:end",
                default="1:300",
            )

        # Unlike the other groups in this file, no `hidden` expression is set.
        advanced = GroupField(
            Advanced,
            label="Advanced parameters",
        )
    class Input:
        """Input fields to process MarkDuplicates.

        Declares the BAM input, a switch to skip the step, the duplicate
        removal flag, the validation stringency and the assumed sort order.
        """

        bam = DataField("alignment:bam", label="Alignment BAM file")
        skip = BooleanField(
            label="Skip MarkDuplicates step",
            description="MarkDuplicates step can be skipped.",
            default=False,
        )
        remove_duplicates = BooleanField(
            label="Remove duplicates",
            description="If true do not write duplicates to the output file "
            "instead of writing them with appropriate flags set.",
            default=False,
        )
        validation_stringency = StringField(
            label="Validation stringency",
            description="Validation stringency for all SAM files read by this "
            "program. Setting stringency to SILENT can improve "
            "performance when processing a BAM file in which "
            "variable-length data (read, qualities, tags) do not "
            "otherwise need to be decoded. Default is STRICT.",
            choices=[
                ("STRICT", "STRICT"),
                ("LENIENT", "LENIENT"),
                ("SILENT", "SILENT"),
            ],
            default="STRICT",
        )
        # The empty string is the default choice ("as in BAM header").
        # NOTE(review): the description does not mention the "duplicate"
        # value that the choices list offers; confirm which one is intended.
        assume_sort_order = StringField(
            label="Assume sort order",
            # Adjacent literals are joined verbatim; the second piece ends
            # with a space so the sentences do not run together.
            description="If not null (default), assume that the input file "
            "has this order even if the header says otherwise. "
            "Possible values are unsorted, queryname, coordinate "
            "and unknown.",
            choices=[
                ("", "as in BAM header (default)"),
                ("unsorted", "unsorted"),
                ("queryname", "queryname"),
                ("coordinate", "coordinate"),
                ("duplicate", "duplicate"),
                ("unknown", "unknown"),
            ],
            default="",
        )
Example #20
0
    class Input:
        """Input fields to process Bamclipper.

        Declares the BAM file, an optional BEDPE file and a skip switch.
        """

        alignment = DataField(
            "alignment:bam",
            label="Alignment BAM file",
        )
        bedpe = DataField(
            "bedpe",
            label="BEDPE file",
            required=False,
        )
        skip = BooleanField(
            default=False,
            label="Skip Bamclipper step",
            description="Use this option to skip Bamclipper step.",
        )
Example #21
0
        class Options:
            """Options group holding the beta-prior switch."""

            beta_prior = BooleanField(
                label="Beta prior",
                description=(
                    "Whether or not to put a zero-mean normal prior "
                    "on the non-intercept coefficients."
                ),
                default=False,
            )
Example #22
0
    class Input:
        """Input fields for GatkGenotypeGVCFs.

        Declares the GVCF inputs, reference sequence, intervals and dbSNP
        files, plus a group of advanced import options.
        """

        # --- Data inputs --------------------------------------------------
        gvcfs = ListField(DataField("variants:gvcf"), label="Input data (GVCF)")
        ref_seq = DataField("seq:nucleotide", label="Reference sequence")

        intervals = DataField("bed", label="Intervals file (.bed)")

        dbsnp = DataField("variants:vcf", label="dbSNP file")

        # Toggle referenced by the `hidden` expression of `advanced_options`.
        advanced = BooleanField(
            label="Show advanced options",
            description="Inspect and modify parameters.",
            default=False,
        )

        class AdvancedOptions:
            """Advanced options: import batch size and fragment consolidation."""

            batch_size = IntegerField(
                label="Batch size",
                description=(
                    "Batch size controls the number of samples "
                    "for which readers are open at once and therefore provides "
                    "a way to minimize memory consumption. However, it can "
                    "take longer to complete. Use the consolidate flag if more "
                    "than a hundred batches were used. This will improve feature "
                    "read time. batchSize=0 means no batching "
                    "(i.e. readers for all samples will be opened at once)."
                ),
                default=0,
            )

            consolidate = BooleanField(
                label="Consolidate",
                description=(
                    "Boolean flag to enable consolidation. If "
                    "importing data in batches, a new fragment is created for "
                    "each batch. In case thousands of fragments are created, "
                    "GenomicsDB feature readers will try to open ~20x as many "
                    "files. Also, internally GenomicsDB would consume more "
                    "memory to maintain bookkeeping data from all fragments. "
                    "Use this flag to merge all fragments into one. Merging "
                    "can potentially improve read performance, however overall "
                    "benefit might not be noticeable as the top Java layers "
                    "have significantly higher overheads. This flag has no "
                    "effect if only one batch is used."
                ),
                default=False,
            )

        advanced_options = GroupField(
            AdvancedOptions,
            label="Advanced options",
            hidden="!advanced",
        )
Example #23
0
    class Input:
        """Input fields to process Bamclipper."""

        alignment = DataField(
            "alignment:bam",
            label="Alignment BAM file",
        )
        # The BEDPE file is the only input declared as optional.
        bedpe = DataField(
            "bedpe",
            label="BEDPE file",
            required=False,
        )
        skip = BooleanField(
            default=False,
            label="Skip Bamclipper step",
            description="Use this option to skip Bamclipper step.",
        )
Example #24
0
        class FilterOptions:
            """Filtering options."""

            # NOTE(review): the ``hidden`` expressions below refer to
            # ``filter_options.<field>``, so this class is presumably exposed
            # as a group named ``filter_options`` -- confirm in the enclosing
            # class.

            # Expression-count filter and its threshold.
            count = BooleanField(
                default=True,
                label="Filter genes based on expression count",
            )
            min_count_sum = IntegerField(
                label="Minimum gene expression count summed over all samples",
                description=(
                    "Filter genes in the expression matrix input. "
                    "Remove genes where the expression count sum over all "
                    "samples is below the threshold."
                ),
                default=10,
                hidden="!filter_options.count",
            )

            # Cook's distance filter and its optional threshold.
            cook = BooleanField(
                default=False,
                label="Filter genes based on Cook's distance",
            )
            cooks_cutoff = FloatField(
                label="Threshold on Cook's distance",
                description=(
                    "If one or more samples have Cook's distance larger than the "
                    "threshold set here, the p-value for the row is set to NA. "
                    "If left empty, the default threshold of 0.99 quantile of the "
                    "F(p, m-p) distribution is used, where p is the number of "
                    "coefficients being fitted and m is the number of samples. "
                    "This test excludes Cook's distance of samples belonging to "
                    "experimental groups with only two samples."
                ),
                required=False,
                hidden="!filter_options.cook",
            )

            # Independent filtering and its significance cut-off.
            independent = BooleanField(
                default=False,
                label="Apply independent gene filtering",
            )
            alpha = FloatField(
                label=(
                    "Significance cut-off used for optimizing "
                    "independent gene filtering"
                ),
                description="The value should be set to adjusted p-value cut-off (FDR).",
                default=0.1,
                hidden="!filter_options.independent",
            )
Example #25
0
    class Input:
        """Input fields to process MarkDuplicates."""

        bam = DataField('alignment:bam', label='Alignment BAM file')
        skip = BooleanField(
            label='Skip MarkDuplicates step',
            description='MarkDuplicates step can be skipped.',
            default=False,
        )
        remove_duplicates = BooleanField(
            label='Remove duplicates',
            description='If true do not write duplicates to the output file '
            'instead of writing them with appropriate flags set.',
            default=False,
        )
        # Each choice is a (value, display label) pair; here both are the same.
        validation_stringency = StringField(
            label='Validation stringency',
            description='Validation stringency for all SAM files read by this '
            'program. Setting stringency to SILENT can improve '
            'performance when processing a BAM file in which '
            'variable-length data (read, qualities, tags) do not '
            'otherwise need to be decoded. Default is STRICT.',
            choices=[
                ('STRICT', 'STRICT'),
                ('LENIENT', 'LENIENT'),
                ('SILENT', 'SILENT'),
            ],
            default='STRICT',
        )
        # The empty-string choice is the default and is labelled as deferring
        # to the BAM header. The description keeps a separating space between
        # its two sentences and lists every value offered in ``choices``.
        assume_sort_order = StringField(
            label='Assume sort order',
            description='If not null (default), assume that the input file '
            'has this order even if the header says otherwise. '
            'Possible values are unsorted, queryname, coordinate, '
            'duplicate and unknown.',
            choices=[
                ('', 'as in BAM header (default)'),
                ('unsorted', 'unsorted'),
                ('queryname', 'queryname'),
                ('coordinate', 'coordinate'),
                ('duplicate', 'duplicate'),
                ('unknown', 'unknown'),
            ],
            default='',
        )
Example #26
0
    class Input:
        """Input fields for CollectRrbsMetrics."""

        bam = DataField(
            "alignment:bam",
            label="Alignment BAM file",
        )
        genome = DataField(
            "seq:nucleotide",
            label="Genome",
        )

        # Numeric thresholds.
        min_quality = IntegerField(
            label="Threshold for base quality of a C base before it is considered",
            default=20,
        )

        next_base_quality = IntegerField(
            label=(
                "Threshold for quality of a base next to a C "
                "before the C base is considered"
            ),
            default=10,
        )

        # NOTE(review): the attribute name is misspelled ("lenght"); it is kept
        # unchanged because it is the field's public name.
        min_lenght = IntegerField(
            label="Minimum read length",
            default=5,
        )

        mismatch_rate = FloatField(
            label=(
                "Maximum fraction of mismatches in a read "
                "to be considered (Range: 0 and 1)"
            ),
            default=0.1,
        )

        # Each choice is a (value, display label) pair with identical members.
        validation_stringency = StringField(
            label="Validation stringency",
            description=(
                "Validation stringency for all SAM files read by this program. "
                "Setting stringency to SILENT can improve performance when "
                "processing a BAM file in which variable-length data (read, "
                "qualities, tags) do not otherwise need to be decoded. "
                "Default is STRICT."
            ),
            choices=[(level, level) for level in ("STRICT", "LENIENT", "SILENT")],
            default="STRICT",
        )

        assume_sorted = BooleanField(
            label="Sorted BAM file",
            description="If true the sort order in the header file will be ignored.",
            default=False,
        )
Example #27
0
    class Input:
        """Input fields to process EdgeR."""

        # The two sample groups, each a list of expression data objects.
        case = ListField(
            DataField("expression"),
            description="Case samples (replicates)",
            label="Case",
        )
        control = ListField(
            DataField("expression"),
            description="Control samples (replicates)",
            label="Control",
        )

        count_filter = IntegerField(
            label="Raw counts filtering threshold",
            description=(
                "Filter genes in the expression matrix input. "
                "Remove genes where the number of counts in all samples is below "
                "the threshold."
            ),
            default=10,
        )

        # Gene-set creation; both thresholds below carry the ``hidden``
        # expression "!create_sets".
        create_sets = BooleanField(
            label="Create gene sets",
            description=(
                "After calculating differential gene expressions create gene "
                "sets for up-regulated genes, down-regulated genes and all genes."
            ),
            default=False,
        )
        logfc = FloatField(
            label="Log2 fold change threshold for gene sets",
            description=(
                "Genes above Log2FC are considered as up-regulated and genes "
                "below -Log2FC as down-regulated."
            ),
            default=1.0,
            hidden="!create_sets",
        )
        fdr = FloatField(
            label="FDR threshold for gene sets",
            hidden="!create_sets",
            default=0.05,
        )
Example #28
0
        class QualityTrimming:
            """Quality trimming options."""

            # Quality-based trimming.
            quality = IntegerField(
                label="Quality cutoff",
                description="Trim low-quality ends from reads based on phred score.",
                default=20,
            )
            nextseq = IntegerField(
                label="NextSeq/NovaSeq trim cutoff",
                description=(
                    "NextSeq/NovaSeq-specific quality trimming. "
                    "Trims also dark cycles appearing as high-quality G bases. "
                    "This will set a specific quality cutoff, but qualities of G "
                    "bases are ignored. "
                    "This can not be used with Quality cutoff and will override it."
                ),
                required=False,
            )
            # Choice values are written in command-line flag form.
            phred = StringField(
                label="Phred score encoding",
                description=(
                    "Use either ASCII+33 quality scores as Phred scores "
                    "(Sanger/Illumina 1.9+ encoding) or ASCII+64 quality scores "
                    "(Illumina 1.5 encoding) for quality trimming"
                ),
                choices=[("--phred33", "ASCII+33"), ("--phred64", "ASCII+64")],
                default="--phred33",
            )

            # Length and N-content filters.
            min_length = IntegerField(
                label="Minimum length after trimming",
                description=(
                    "Discard reads that became shorter than selected length "
                    "because of either quality or adapter trimming. "
                    "Both reads of a read-pair need to be longer than specified "
                    "length to be printed out to validated paired-end files. "
                    "If only one read became too short there is the possibility "
                    "of keeping such unpaired single-end reads with Retain "
                    "unpaired. "
                    "A value of 0 disables filtering based on length."
                ),
                default=20,
            )
            max_n = IntegerField(
                label="Maximum number of Ns",
                description=(
                    "Read exceeding this limit will result in the entire pair "
                    "being removed from the trimmed output files."
                ),
                required=False,
            )

            # Unpaired-read handling; the two cutoffs carry the ``hidden``
            # expression "!quality_trim.retain_unpaired".
            retain_unpaired = BooleanField(
                label="Retain unpaired reads after trimming",
                description=(
                    "If only one of the two paired-end reads became too short, "
                    "the longer read will be written."
                ),
                default=False,
            )
            unpaired_len_1 = IntegerField(
                label="Unpaired read length cutoff for mate 1",
                hidden="!quality_trim.retain_unpaired",
                default=35,
            )
            unpaired_len_2 = IntegerField(
                label="Unpaired read length cutoff for mate 2",
                hidden="!quality_trim.retain_unpaired",
                default=35,
            )

            # Fixed-length clipping from the 5' end.
            clip_r1 = IntegerField(
                label="Trim bases from 5' end of read 1",
                description=(
                    "This may be useful if the qualities were very poor, or if "
                    "there is some sort of unwanted bias at the 5' end."
                ),
                required=False,
            )
            clip_r2 = IntegerField(
                label="Trim bases from 5' end of read 2",
                description=(
                    "This may be useful if the qualities were very poor, or if "
                    "there is some sort of unwanted bias at the 5' end. "
                    "For paired-end bisulfite sequencing, it is recommended to "
                    "remove the first few bp because the end-repair reaction may "
                    "introduce a bias towards low methylation."
                ),
                required=False,
            )

            # Fixed-length clipping from the 3' end.
            three_prime_r1 = IntegerField(
                label="Trim bases from 3' end of read 1",
                description=(
                    "Remove bases from the 3' end of read 1 after adapter/quality "
                    "trimming has been performed. "
                    "This may remove some unwanted bias from the 3' end that is "
                    "not directly related to adapter sequence or basecall quality."
                ),
                required=False,
            )
            three_prime_r2 = IntegerField(
                label="Trim bases from 3' end of read 2",
                description=(
                    "Remove bases from the 3' end of read 2 after adapter/quality "
                    "trimming has been performed. "
                    "This may remove some unwanted bias from the 3' end that is "
                    "not directly related to adapter sequence or basecall quality."
                ),
                required=False,
            )
Example #29
0
    class Input:
        """Input fields to process MarkDuplicates."""

        bam = DataField("alignment:bam", label="Alignment BAM file")
        skip = BooleanField(
            label="Skip MarkDuplicates step",
            description="MarkDuplicates step can be skipped.",
            default=False,
        )
        remove_duplicates = BooleanField(
            label="Remove duplicates",
            description="If true do not write duplicates to the output file "
            "instead of writing them with appropriate flags set.",
            default=False,
        )
        # Each choice is a (value, display label) pair; here both are the same.
        validation_stringency = StringField(
            label="Validation stringency",
            description="Validation stringency for all SAM files read by this "
            "program. Setting stringency to SILENT can improve "
            "performance when processing a BAM file in which "
            "variable-length data (read, qualities, tags) do not "
            "otherwise need to be decoded. Default is STRICT.",
            choices=[
                ("STRICT", "STRICT"),
                ("LENIENT", "LENIENT"),
                ("SILENT", "SILENT"),
            ],
            default="STRICT",
        )
        # The empty-string choice is the default and is labelled as deferring
        # to the BAM header. The description keeps a separating space between
        # its two sentences and lists every value offered in ``choices``.
        assume_sort_order = StringField(
            label="Assume sort order",
            description="If not null (default), assume that the input file "
            "has this order even if the header says otherwise. "
            "Possible values are unsorted, queryname, coordinate, "
            "duplicate and unknown.",
            choices=[
                ("", "as in BAM header (default)"),
                ("unsorted", "unsorted"),
                ("queryname", "queryname"),
                ("coordinate", "coordinate"),
                ("duplicate", "duplicate"),
                ("unknown", "unknown"),
            ],
            default="",
        )

        class BigWigOptions:
            """Options for calculating BigWig."""

            bigwig_binsize = IntegerField(
                label="BigWig bin size",
                description="Size of the bins, in bases, for the output of the "
                "bigwig/bedgraph file. Default is 50.",
                default=50,
            )
            bigwig_timeout = IntegerField(
                label="BigWig timeout",
                description="Number of seconds before creation of BigWig timeouts. "
                "Default is after 480 seconds (8 minutes).",
                default=480,
            )

        # Exposes the BigWigOptions fields as the ``bigwig_opts`` group.
        bigwig_opts = GroupField(BigWigOptions, label="BigWig options")
Example #30
0
    class Input:
        """Input fields to process Deseq."""

        # The two sample groups, each a list of expression data objects.
        case = ListField(
            DataField("expression"),
            label="Case",
            description="Case samples (replicates)",
        )
        control = ListField(
            DataField("expression"),
            label="Control",
            description="Control samples (replicates)",
        )

        # Gene-set creation; both thresholds below carry the ``hidden``
        # expression "!create_sets".
        create_sets = BooleanField(
            label="Create gene sets",
            description="After calculating differential gene "
            "expressions create gene sets for up-regulated genes, "
            "down-regulated genes and all genes.",
            default=False,
        )
        logfc = FloatField(
            label="Log2 fold change threshold for gene sets",
            description="Genes above Log2FC are considered as "
            "up-regulated and genes below -Log2FC as down-regulated.",
            default=1.0,
            hidden="!create_sets",
        )
        fdr = FloatField(
            label="FDR threshold for gene sets",
            default=0.05,
            hidden="!create_sets",
        )

        class Options:
            """Options."""

            beta_prior = BooleanField(
                label="Beta prior",
                default=False,
                description="Whether or not to put a zero-mean normal prior "
                "on the non-intercept coefficients.",
            )

        class FilterOptions:
            """Filtering options."""

            # The ``hidden`` expressions use the ``filter_options`` group name
            # assigned at the bottom of the enclosing class.
            count = BooleanField(
                label="Filter genes based on expression count",
                default=True,
            )
            min_count_sum = IntegerField(
                label="Minimum gene expression count summed over all samples",
                default=10,
                description="Filter genes in the expression matrix input. "
                "Remove genes where the expression count sum over all samples "
                "is below the threshold.",
                hidden="!filter_options.count",
            )
            cook = BooleanField(
                label="Filter genes based on Cook's distance",
                default=False,
            )
            cooks_cutoff = FloatField(
                label="Threshold on Cook's distance",
                required=False,
                description="If one or more samples have Cook's distance "
                "larger than the threshold set here, the p-value for the row "
                "is set to NA. If left empty, the default threshold of 0.99 "
                "quantile of the F(p, m-p) distribution is used, where p is "
                "the number of coefficients being fitted and m is the number "
                "of samples. This test excludes Cook's distance of samples "
                "belonging to experimental groups with only two samples.",
                hidden="!filter_options.cook",
            )
            independent = BooleanField(
                label="Apply independent gene filtering",
                default=False,
            )
            alpha = FloatField(
                label="Significance cut-off used for optimizing independent "
                "gene filtering",
                default=0.1,
                description="The value should be set to adjusted p-value "
                "cut-off (FDR).",
                hidden="!filter_options.independent",
            )

        # Group labels match the group contents: ``Options`` holds the
        # analysis setting (beta prior), ``FilterOptions`` holds the gene
        # filters. The two labels were previously swapped.
        options = GroupField(
            Options, label="Differential expression analysis options")
        filter_options = GroupField(FilterOptions, label="Gene filtering options")