def __init__(self, cmd="samtools", **kwargs): """Initialize the class.""" self.program_name = cmd self.parameters = [ _StaticArgument("phase"), _Argument(["input", "input_bam", "in_bam"], "Input file", filename=True, is_required=True), _Switch(["-A", "A"], "Drop reads with ambiguous phase"), _Option(["-b", "b"], "Prefix of BAM output", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Switch(["-F", "F"], "Do not attempt to fix chimeric reads"), _Option(["-k", "k"], "Maximum length for local phasing", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-q", "q"], """Minimum Phred-scaled LOD to call a heterozygote""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-Q", "Q"], """Minimum base quality to be used in het calling""", equate=False, checker_function=lambda x: isinstance(x, int)) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd = "fneighbor", **kwargs): self.parameters = [ _Option(["-datafile", "datafile"], "dist file to use (phylip)", filename=True, is_required=True), _Option(["-matrixtype", "matrixtype"], "is martrix [S]quare pr [u]pper or [l]ower"), _Option(["-treetype", "treetype"], "nj or UPGMA tree (n/u)"), _Option(["-outgrno","outgrno" ], "taxon to use as OG"), _Option(["-jumble", "jumble"], "randommise input order (Y/n)"), _Option(["-seed", "seed"], "provide a random seed"), _Option(["-trout", "trout"], "write tree (Y/n)"), _Option(["-outtreefile", "outtreefile"], "filename for output tree"), _Option(["-progress", "progress"], "print progress (Y/n)"), _Option(["-treeprint", "treeprint"], "print tree (Y/n)")] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="samtools", **kwargs): self.program_name = cmd # options for version samtools 1.3.1 self.parameters = [ _StaticArgument("sort"), _Switch(["-n", "n"], """Sort by read names rather than by chromosomal coordinates"""), _Option(["-o", "o"], """(file) Write the final sorted output to FILE, rather than to standard output""", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-O", "O"], """(FORMAT) Write the final output as sam, bam, or cram""", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-T", "T"], """(PREFIX) Write temporary files to PREFIX.nnnn.bam, or if the specified PREFIX is an existing directory, to PREFIX/samtools.mmm.mmm.tmp.nnnn.bam, where mmm is unique to this invocation of the sort command""", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-I", "I"], """(INT) Set the desired compression level for the final output file, ranging from 0 (uncompressed) or 1 (fastest but minimal compression) to 9 (best compression but slowest to write), similarly to gzip(1)'s compression level setting.""", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-m", "m"], "Approximately the maximum required memory", equate=False, checker_function=lambda x: isinstance(x, int)), _Argument(["input"], "Input SAM/BAM/CRAM file", filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd='makeblastdb', **kwargs):
    """Initialize the class."""
    assert cmd is not None
    self.parameters = [
        # Output configuration options
        _Option(["-out", "out"],
                "Output file prefix for the database.",
                filename=True, equate=False),
        _Option(["-in", "db"],
                "The sequence file to create the database with.",
                filename=True, equate=False),
        # Should this be required?
        _Option(["-dbtype", "dbtype"],
                "Molecule type of target db (string, 'nucl' or 'prot').",
                equate=False),
    ]
    extra_parameters = [
        # Core:
        _Switch(["-h", "h"],
                "Print USAGE and DESCRIPTION; ignore other arguments."),
        _Switch(["-help", "help"],
                "Print USAGE, DESCRIPTION and ARGUMENTS description; "
                "ignore other arguments."),
        _Switch(["-version", "version"],
                "Print version number; ignore other arguments."),
    ]
    try:
        # Insert extra parameters - at the start just in case there
        # are any arguments which must come last:
        self.parameters = extra_parameters + self.parameters
    except AttributeError:
        # Should we raise an error? The subclass should have set this up!
        self.parameters = extra_parameters
    AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="primersearch", **kwargs): self.parameters = [ _Option( ["-seqall", "-sequences", "sequences", "seqall"], "Sequence to look for the primer pairs in.", is_required=True, ), # When this wrapper was written primersearch used -sequences # as the argument name. Since at least EMBOSS 5.0 (and # perhaps earlier) this has been -seqall instead. _Option( ["-infile", "-primers", "primers", "infile"], "File containing the primer pairs to search for.", filename=True, is_required=True, ), # When this wrapper was written primersearch used -primers # as the argument name. Since at least EMBOSS 5.0 (and # perhaps earlier) this has been -infile instead. _Option( ["-mismatchpercent", "mismatchpercent"], "Allowed percentage mismatch (any integer value, default 0).", is_required=True, ), _Option(["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)"), _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="samtools", **kwargs): self.program_name = cmd self.parameters = [ _StaticArgument("merge"), _Switch(["-n", "n"], """The input alignments are sorted by read names rather than by chromosomal coordinates"""), _Switch(["-r", "r"], """Attach an RG tag to each alignment. The tag value is inferred from file names"""), _Switch(["-u", "u"], "Uncompressed BAM output"), _Switch(["-1", "fast_bam"], """Use zlib compression level 1 to compress the output"""), _Switch(["-f", "f"], """Force to overwrite the output file if present"""), _Option(["-h", "h"], """Use the lines of FILE as '@' headers to be copied to out.bam""", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-R", "R"], "Merge files in the specified region indicated by STR", equate=False, checker_function=lambda x: isinstance(x, str)), _Argument(["output_bam", "out_bam", "out", "output"], "Output BAM file", filename=True, is_required=True), _ArgumentList(["input_bam", "in_bam", "input", "bam"], "Input BAM", filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="hmmscan", **kwargs): assert cmd is not None self.parameters = [ _Switch(["--cut_ga", "cut_ga"], "Gathering Cutoff"), _Switch(["--cut_nc", "cut_nc"], "Noise Cutoff"), _Switch(["--cut_tc", "cut_tc"], "Trusted Cutoff"), _Switch(["-h", "help"], "Print USAGE, DESCRIPTION and ARGUMENTS description; ignore other arguments."), _Switch(["--acc", "accession"], "prefer accessions over names in output"), _Option(["--cpu", "cpu"], "number of parallel CPU workers to use for multithreads"), _Option(["-o", "out"], "Output File", filename=True, equate=False ), _Argument(["hmm"], "HMM Library", checker_function=os.path.exists, filename=True, is_required=True), _Argument(["input"], "FASTA Query file", checker_function=os.path.exists, filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="seqmatchall", **kwargs): self.parameters = [ _Option(["-sequence", "sequence"], "Readable set of sequences", filename=True, is_required=True), _Option(["-wordsize", "wordsize"], "Word size (Integer 2 or more, default 4)"), _Option(["-aformat", "aformat"], "Display output in a different specified output format"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="bwa", **kwargs): self.program_name = cmd self.parameters = \ [ _StaticArgument("index"), _Option(["-a", "a", "algorithm"], """Algorithm for constructing BWT index. Available options are: - is: IS linear-time algorithm for constructing suffix array. It requires 5.37N memory where N is the size of the database. IS is moderately fast, but does not work with database larger than 2GB. IS is the default algorithm due to its simplicity. - bwtsw: Algorithm implemented in BWT-SW. This method works with the whole human genome, but it does not work with database smaller than 10MB and it is usually slower than IS.""", checker_function=lambda x: x in ["is", "bwtsw"], equate=False, is_required=True), _Option(["-p", "p", "prefix"], "Prefix of the output database [same as db filename]", equate=False, is_required=False), _Argument(["infile"], "Input file name", filename=True, is_required=True), _Switch(["-c", "c"], "Build color-space index. The input fasta should be in nucleotide space.") ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="garnier"): Application.AbstractCommandline.__init__(self) self.program_name = cmd self.parameters = [ _Option(["-sequence"], ["input", "file"], None, 1, "Sequence to predict SS of"), _Option(["-outfile"], ["output", "file"], "temp.garnier", 0, "Output file for the Statistics"), _Option(["-idc"], ["input"], None, 0, "Decision constants"), ]
def __init__(self, cmd="seqret", **kwargs): self.parameters = [ _Option(["-sequence", "sequence"], "Input sequence(s) filename", filename=True), _Option(["-outseq", "outseq"], "Output sequence file.", filename=True), _Option(["-sformat", "sformat"], "Input sequence(s) format (e.g. fasta, genbank)"), _Option(["-osformat", "osformat"], "Output sequence(s) format (e.g. fasta, genbank)"), ] _EmbossMinimalCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="pepstats"): Application.AbstractCommandline.__init__(self) self.program_name = cmd self.parameters = [ _Option(["-sequence"], ["input", "file"], None, 1, "First sequence to align"), _Option(["-outfile"], ["output", "file"], None, 1, "Output file for the Statistics"), _Option(["-aadata"], ["input", "file"], None, 0, "Matrix file"), ]
def __init__(self, cmd="fastacmd", **kwargs): self.parameters = \ [ _Option(["-d", "database"], ["input"], None, 1, "The database to retrieve from."), _Option(["-s", "search_string"], ["input"], None, 1, "The id to search for.") ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="fuzznuc", **kwargs): self.parameters = [ _Option(["-sequence", "sequence"], "Sequence database USA", is_required=True), _Option(["-pattern", "pattern"], "Search pattern, using standard IUPAC one-letter codes", is_required=True), _Option(["-mismatch", "mismatch"], "Number of mismatches", is_required=True), _Option(["-complement", "complement"], "Search complementary strand"), _Option(["-rformat", "rformat"], "Specify the report format to output in."), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="tranalign", **kwargs): self.parameters = [ _Option( ["-asequence", "asequence"], "Nucleotide sequences to be aligned.", filename=True, is_required=True ), _Option(["-bsequence", "bsequence"], "Protein sequence alignment", filename=True, is_required=True), _Option(["-outseq", "outseq"], "Output sequence file.", filename=True, is_required=True), _Option(["-table", "table"], "Code to use"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="iep", **kwargs): self.parameters = [ _Option(["-sequence","sequence"], ["input", "file"], None, 1, "Protein sequence(s) filename"), _Option(["-amino","amino"], ["input"], None, 0), _Option(["-lysinemodified","lysinemodified"], ["input"], None, 0), _Option(["-disulphides","disulphides"], ["input"], None, 0), _Option(["-notermini","notermini"], ["input"], None, 0), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="needle", **kwargs): self.parameters = [ _Option(["-asequence","asequence"], "First sequence to align", filename=True, is_required=True), _Option(["-bsequence","bsequence"], "Second sequence to align", filename=True, is_required=True), _Option(["-gapopen","gapopen"], "Gap open penalty", is_required=True), _Option(["-gapextend","gapextend"], "Gap extension penalty", is_required=True), _Option(["-datafile","datafile"], "Matrix file", filename=True), _Option(["-similarity","similarity"], "Display percent identity and similarity"), _Option(["-snucleotide","snucleotide"], "Sequences are nucleotide (boolean)"), _Option(["-sprotein","sprotein"], "Sequences are protein (boolean)"), _Option(["-aformat","aformat"], "Display output in a different specified output format")] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="tranalign", **kwargs): self.parameters = [ _Option(["-asequence","asequence"], ["input", "file"], None, 1, "Nucleotide sequences to be aligned."), _Option(["-bsequence","bsequence"], ["input", "file"], None, 1, "Protein sequence alignment"), _Option(["-outseq","outseq"], ["output", "file"], None, 1, "Output sequence file."), _Option(["-table","table"], ["input"], None, 0, "Code to use")] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="bwa samse", **kwargs): self.program_name = cmd self.parameters = \ [ _Argument(["reference"],"Reference file name", filename=True, is_required=True), _Argument(["sai_file"],"Sai file name", filename=True, is_required=True), _Argument(["read_file"],"Read file name", filename=True, is_required=True), _Option(["-n","n"],"Maximum number of alignments to output in the XA tag for reads paired properly. If a read has more than INT hits, the XA tag will not be written. [3]",filename=False, equate=False,checker_function=lambda x : isinstance(x,int)), _Option(["-r","r"],"Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]",filename=False, equate=False,checker_function=lambda x : isinstance(x,basestring)) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="primersearch", **kwargs): self.parameters = \ [_Option(["-sequences","sequences"], ["input"], None, 1, "Sequence to look for the primer pairs in."), _Option(["-primers","primers"], ["input", "file"], None, 1, "File containing the primer pairs to search for."), #Including -out and out for backwards compatibility only! #_Option(["-outfile","-out","out","outfile"], ["output", "file"], None, 0, # "Name of the output file."), _Option(["-mismatchpercent","mismatchpercent"], ["input"], None, 1, "Allowed percentage mismatch.")] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, fastacmd = "fastacmd"): Application.AbstractCommandline.__init__(self) self.program_name = fastacmd self.parameters = \ [ _Option(["-d", "database"], ["input"], None, 1, "The database to retrieve from."), _Option(["-s", "search_string"], ["input"], None, 1, "The id to search for.") ]
def __init__(self, cmd="t_coffee", **kwargs): self.parameters = [ _Option( ["-output", "output"], """Specify the output type. One (or more separated by a comma) of: 'clustalw_aln', 'clustalw', 'gcg', 'msf_aln', 'pir_aln', 'fasta_aln', 'phylip', 'pir_seq', 'fasta_seq' Note that of these Biopython's AlignIO module will only read clustalw, pir, and fasta. """, # TODO - Can we read the PHYLIP output? equate=False, ), _Option(["-infile", "infile"], "Specify the input file.", filename=True, is_required=True, equate=False), # Indicates the name of the alignment output by t_coffee. If the # default is used, the alignment is named <your sequences>.aln _Option( ["-outfile", "outfile"], "Specify the output file. Default: <your sequences>.aln", filename=True, equate=False, ), _Switch(["-convert", "convert"], "Specify you want to perform a file conversion"), _Option( ["-type", "type"], "Specify the type of sequence being aligned", checker_function=lambda x: x in self.SEQ_TYPES, equate=False, ), _Option( ["-outorder", "outorder"], "Specify the order of sequence to output" "Either 'input', 'aligned' or <filename> of " "Fasta file with sequence order", equate=False, ), _Option( ["-matrix", "matrix"], "Specify the filename of the substitution matrix to use." "Default: blosum62mt", equate=False, ), _Option( ["-gapopen", "gapopen"], "Indicates the penalty applied for opening a gap " "(negative integer)", checker_function=lambda x: isinstance(x, int), equate=False, ), _Option( ["-gapext", "gapext"], "Indicates the penalty applied for extending a " "gap. (negative integer)", checker_function=lambda x: isinstance(x, int), equate=False, ), _Switch(["-quiet", "quiet"], "Turn off log output"), _Option(["-mode", "mode"], "Specifies a special mode: genome, quickaln, dali, 3dcoffee", equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="cmsearch", **kwargs): self.parameters = [ _Option(['--forecast','forecast'],'Forecast the time of execution, not searching', equate=False,is_required=False), _Switch(['--ga','usega'],'Search above the defined cutoff'), _Option(['-Z','size'],'Database size in MB',equate=False), _Switch(['-g','glocal'],'Performs a glocal alignment search'), _Switch(['--noalign','noalign'],'Prints just the start, stop, score'), _Argument(['modelname'],'name of model',filename=True,is_required=True), _Argument(['database'],'Database file', filename=True,is_required=True) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd = "tranalign"): Application.AbstractCommandline.__init__(self) self.program_name = cmd self.parameters = [ _Option(["-asequence"], ["input", "file"], None, 1, "Nucleotide sequences to be aligned."), _Option(["-bsequence"], ["input", "file"], None, 1, "Protein sequence alignment"), _Option(["-outseq"], ["output", "file"], None, 1, "Output sequence file."), _Option(["-table"], ["input"], None, 0, "Code to use")]
def __init__(self, cmd = "primersearch"): Application.AbstractCommandline.__init__(self) self.program_name = cmd self.parameters = \ [_Option(["-sequences"], ["input"], None, 1, "Sequence to look for the primer pairs in."), _Option(["-primers"], ["input", "file"], None, 1, "File containing the primer pairs to search for."), _Option(["-out"], ["output", "file"], None, 1, "Name of the output file."), _Option(["-mismatchpercent"], ["input"], None, 1, "Allowed percentage mismatch.")]
def __init__(self, cmd = "AlignACE"): Application.AbstractCommandline.__init__(self) self.program_name = cmd self.parameters = \ [ _Option(["-i","input","Sequence File"],["input"],lambda x : x.__class__== str,1, "Input Sequence file in FASTA format."), _Option(["-numcols","numcols","number of columns to align"],["input"],lambda x : x.__class__== int,0, "Number of columns to align"), _Option(["-expect","expect","number of sites expected in model "],["input"],lambda x : x.__class__== int,0, "number of sites expected in model "), _Option(["-gcback","gcback","background fractional GC content of input sequence"],["input"],lambda x : x.__class__== float,0, "background fractional GC content of input sequence"), _Option(["-minpass","minpass","minimum number of non-improved passes in phase 1"],["input"],lambda x : x.__class__== int,0, "minimum number of non-improved passes in phase 1"), _Option(["-seed","seed","set seed for random number generator (time)"],["input"],lambda x : x.__class__== int,0, "set seed for random number generator (time)"), _Option(["-undersample","undersample","possible sites / (expect * numcols * seedings)"],["input"],lambda x : x.__class__== int,0, "possible sites / (expect * numcols * seedings)"), _Option(["-oversample","oversample","1/undersample"],["input"],lambda x : x.__class__== int,0, "1/undersample"), ]
def __init__(self, cmd="AlignACE", **kwargs): self.parameters = \ [ _Option(["-i","input"],["input"],lambda x : isinstance(x, str),1, "Input Sequence file in FASTA format."), _Option(["-numcols","numcols"],["input"],lambda x : isinstance(x, int),0, "Number of columns to align"), _Option(["-expect","expect"],["input"],lambda x : isinstance(x, int),0, "number of sites expected in model "), _Option(["-gcback","gcback"],["input"],lambda x : isinstance(x, float),0, "background fractional GC content of input sequence"), _Option(["-minpass","minpass"],["input"],lambda x : isinstance(x, int),0, "minimum number of non-improved passes in phase 1"), _Option(["-seed","seed"],["input"],lambda x : isinstance(x, int),0, "set seed for random number generator (time)"), _Option(["-undersample","undersample"],["input"],lambda x : isinstance(x, int),0, "possible sites / (expect * numcols * seedings)"), _Option(["-oversample","oversample"],["input"],lambda x : isinstance(x, int),0, "1/undersample"), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="RNAalifold", **kwargs): self.parameters = [ _Option(['','filename'],'The MSA file',equate=False,filename=True), _Option(['-cv','covariance'],'Set the weight of the covariance term in the energy function to factor', equate=False), _Option(['-nc','non-compatible'],'Set the penalty for non-compatible sequences in the covariance term of the energy function to factor', equate=False), _Switch(['-mis','mis'],'Output "most informative sequence" instead of simple consensus'), _Switch(['-E','endgaps'],'Score pairs with endgaps same as gap-gap pairs'), _Switch(['-p','partition'],'Calculate the partition function and base pairing probability matrix'), _Switch(['-color','color'],'Produce a colored version of the consensus strcture plot "alirna.ps"'), _Switch(['-aln','alignment'],'Produce a colored and structure annotated alignment in PostScript format in the file "aln.ps" in the current directory') ] _ViennaMinimalCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="samtools", **kwargs): self.program_name = cmd self.parameters = [ _StaticArgument("cat"), _Option(["-h", "h"], "Header SAM file", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-o", "o"], "Output SAM file", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _ArgumentList(["input", "input_bam", "bams"], "Input BAM files", filename=True, is_required=True) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="cmcalibrate", **kwargs): self.parameters = [ _Option(['--forecast','forecast'],'Forecast the time of execution, not calibrating', equate=False,is_required=False), _Argument(['modelname'],'name of model',filename=True,is_required=True) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="est2genome", **kwargs): """Initialize the class.""" self.parameters = [ _Option(["-est", "est"], "EST sequence(s)", is_required=True), _Option(["-genome", "genome"], "Genomic sequence", is_required=True), _Option(["-match", "match"], "Score for matching two bases"), _Option(["-mismatch", "mismatch"], "Cost for mismatching two bases"), _Option( ["-gappenalty", "gappenalty"], "Cost for deleting a single base in either sequence, " "excluding introns", ), _Option( ["-intronpenalty", "intronpenalty"], "Cost for an intron, independent of length.", ), _Option( ["-splicepenalty", "splicepenalty"], "Cost for an intron, independent of length " "and starting/ending on donor-acceptor sites", ), _Option( ["-minscore", "minscore"], "Exclude alignments with scores below this threshold score.", ), _Option( ["-reverse", "reverse"], "Reverse the orientation of the EST sequence" ), _Option(["-splice", "splice"], "Use donor and acceptor splice sites."), _Option( ["-mode", "mode"], "This determines the comparion mode. 'both', 'forward', or 'reverse'", ), _Option( ["-best", "best"], "You can print out all comparisons instead of just the best", ), _Option(["-space", "space"], "for linear-space recursion."), _Option(["-shuffle", "shuffle"], "Shuffle"), _Option(["-seed", "seed"], "Random number seed"), _Option(["-align", "align"], "Show the alignment."), _Option(["-width", "width"], "Alignment width"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="samtools", **kwargs): """Initialize the class.""" self.program_name = cmd self.parameters = [ _StaticArgument("mpileup"), _Switch(["-E", "E"], """Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt specificity a little bit"""), _Switch(["-B", "B"], """Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments"""), _Switch(["-g", "g"], """Compute genotype likelihoods and output them in the binary call format (BCF)"""), _Switch(["-u", "u"], """Similar to -g except that the output is uncompressed BCF, which is preferred for piping"""), _Option(["-C", "C"], """Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-r", "r"], "Only generate pileup in region STR", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-f", "f"], """The faidx-indexed reference file in the FASTA format. The file can be optionally compressed by razip""", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-l", "l"], """BED or position list file containing a list of regions or sites where pileup or BCF should be generated""", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-M", "M"], "Cap Mapping Quality at M", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-q", "q"], "Minimum mapping quality for an alignment to be used", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-Q", "Q"], "Minimum base quality for a base to be considered", equate=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-6", "illumina_13"], "Assume the quality is in the Illumina 1.3+ encoding"), _Switch(["-A", "A"], "Do not skip anomalous read pairs in variant calling."), _Option(["-b", "b"], "List of input BAM files, one file per line", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-d", "d"], "At a position, read maximally INT reads per input BAM", equate=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-D", "D"], "Output per-sample read depth"), _Switch(["-S", "S"], """Output per-sample Phred-scaled strand bias P-value"""), _Option(["-e", "e"], """Phred-scaled gap extension sequencing error probability. Reducing INT leads to longer indels""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-h", "h"], """Coefficient for modeling homopolymer errors. Given an l-long homopolymer run, the sequencing error of an indel of size s is modeled as INT*s/l""", equate=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-I", "I"], "Do not perform INDEL calling"), _Option(["-L", "L"], """Skip INDEL calling if the average per-sample depth is above INT""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-o", "o"], """Phred-scaled gap open sequencing error probability. 
Reducing INT leads to more indel calls.""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-p", "p"], """Comma delimited list of platforms (determined by @RG-PL) from which indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA""", equate=False, checker_function=lambda x: isinstance(x, str)), _ArgumentList(["input_file"], "Input File for generating mpileup", filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, fastsimcoal_dir=None, cmd='fastsimcoal', **kwargs): self.parameters = [ _Option(["-i", "--ifile", "parfile"], "Name of the parameter file", filename=True, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Option(["-n", "--numsims", "numsims"], "Number of simulations to perform", filename=False, equate=False, is_required=True, checker_function=lambda x: isinstance(x, int)), _Option(["-t", "--tfile", "tfile"], "Name of template parameter file", filename=True, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Option(["-f", "--dfile", "dfile"], "Name of parameter definition file", filename=True, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Option(["-F", "--dFile", "dFile"], """Same as -f but only uses simple parameters defined in the template file. Complex params are recomputed""", filename=True, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Option(["-e", "--efile", "efile"], """Parameter prior definition file. Parameters drawn from specified distributions are substituted into template file.""", filename=True, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Option(["-E", "--numest", "numest"], """Number of estimations from parameter priors. Listed parameter values are substituted in template file.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-g", "--genotypic", "genotypic"], "Generates Arlequin projects with genotypic data"), _Switch(["-p", "--phased", "phased"], "Specifies that phase is known in Arlequin output"), _Option(["-s", "--dnatosnp", "dnatosnp"], """"Output DNA as SNP data (0: ancestral, 1: derived and specify maximum no. SNPs to output.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Switch([ "-S", "--allsites", "allsites" ], """Output the whole DNA sequence, including monomorphic sites""" ), _Switch(["-I", "--inf", "inf"], """Generates DNA mutations according to an infinite sites (IS) mutation model."""), _Switch(["-d", "--dsfs", "dsfs"], "Computes derived site frequency spectrum"), _Switch(["-m", "--msfs", "msfs"], "Computes minor site frequency spectrum"), _Option(["-o", "--oname", "oname"], "Generic name for observed SFS files", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, str)), _Switch(["-H", "--header", "header"], "Generates header in site frequency spectrum files."), _Switch(["-q", "--quiet", "quiet"], "Minimal messages output to console"), _Switch(["-T", "--tree", "tree"], "Output coalescent tree in nexus format."), _Option(["-k", "--keep", "keep"], """Number of simulated polymorphic sites kept in memory. If the simulated no. is larger, then temporary files are created.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Option( ["--seed", "seed"], "Seed for the random number generator (positive int <=1E6)", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-x", "--noarloutput", "noarloutput"], "Does not generate Arlequin output"), _Switch(["-D", "--dadioutput", "dadioutput"], "Output SFS in dadi format"), _Option(["-M", "--maxlhood", "maxlhood"], """Perform parameter estimation by max lhood from SFS, and define stop criterion as min., rel., diff. 
in parameter values between iterations""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, float)), _Option(["-N", "--maxnumsims", "maxnumsims"], """Maximum number of simulations to perform during likelihood maximization.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Option(["-l", "--minnumloops", "minnumloops"], """Minimum number of iteration loops to perform during likelihood maximization.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Option(["-L", "--maxnumloops", "maxnumloops"], """Maximum number of iterations to perform during likelihood maximization""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Option(["-C", "--minSFSCount", "minSFSCount"], """Minimum observed SFS entry count taken into account in likelihood computation""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-0", "--removeZeroSFS", "removeZeroSFS"], """Do not take into account monomorphic sites for SFS likelihood computation."""), _Option(["-a", "--ascDeme", "ascDeme"], """This is the deme id where ascertainment is performed when simulating SNPs.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Option(["-A", "--ascSize", "ascSize"], """Number of ascertained chromosomes used to define SNPs in a given deme.""", filename=False, equate=False, is_required=False, checker_function=lambda x: isinstance(x, int)), _Switch(["-u", "--multiSFS", "multiSFS"], "Generate or use multidimensional SFS") ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd='phyml', **kwargs): self.parameters = [ _Option(['-i', '--input', 'input'], """Name of the nucleotide or amino-acid sequence file in PHYLIP format.""", filename=True, is_required=True, equate=False, ), _Option(['-d', '--datatype', 'datatype'], """Data type is 'nt' for nucleotide (default) and 'aa' for amino-acid sequences.""", checker_function=lambda x: x in ('nt', 'aa'), equate=False, ), _Switch(['-q', '--sequential', 'sequential'], "Changes interleaved format (default) to sequential format." ), _Option(['-n', '--multiple', 'multiple'], "Number of data sets to analyse (integer).", checker_function=(lambda x: isinstance(x, int) or x.isdigit()), equate=False, ), _Switch(['-p', '--pars', 'pars'], """Use a minimum parsimony starting tree. This option is taken into account when the '-u' option is absent and when tree topology modifications are to be done. """ ), _Option(['-b', '--bootstrap', 'bootstrap'], """Number of bootstrap replicates, if value is > 0. Otherwise: 0: neither approximate likelihood ratio test nor bootstrap values are computed. -1: approximate likelihood ratio test returning aLRT statistics. -2: approximate likelihood ratio test returning Chi2-based parametric branch supports. -4: SH-like branch supports alone. """, equate=False, ), _Option(['-m', '--model', 'model'], """Substitution model name. Nucleotide-based models: HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR | custom For the custom option, a string of six digits identifies the model. For instance, 000000 corresponds to F81 (or JC69, provided the distribution of nucleotide frequencies is uniform). 012345 corresponds to GTR. This option can be used for encoding any model that is a nested within GTR. Amino-acid based models: LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV | CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb | custom """, checker_function=(lambda x: x in ( # Nucleotide models: 'HKY85', 'JC69', 'K80', 'F81', 'F84', 'TN93', 'GTR', # Amino acid models: 'LG', 'WAG', 'JTT', 'MtREV', 'Dayhoff', 'DCMut', 'RtREV', 'CpREV', 'VT', 'Blosum62', 'MtMam', 'MtArt', 'HIVw', 'HIVb') or isinstance(x, int)), equate=False, ), _Option(['-f', 'frequencies'], """Character frequencies. -f e, m, or "fA fC fG fT" e : Empirical frequencies, determined as follows : - Nucleotide sequences: (Empirical) the equilibrium base frequencies are estimated by counting the occurrence of the different bases in the alignment. - Amino-acid sequences: (Empirical) the equilibrium amino-acid frequencies are estimated by counting the occurrence of the different amino-acids in the alignment. m : ML/model-based frequencies, determined as follows : - Nucleotide sequences: (ML) the equilibrium base frequencies are estimated using maximum likelihood - Amino-acid sequences: (Model) the equilibrium amino-acid frequencies are estimated using the frequencies defined by the substitution model. "fA fC fG fT" : only valid for nucleotide-based models. fA, fC, fG and fT are floating-point numbers that correspond to the frequencies of A, C, G and T, respectively. """, filename=True, # ensure ".25 .25 .25 .25" stays quoted equate=False, ), _Option(['-t', '--ts/tv', 'ts_tv_ratio'], """Transition/transversion ratio. (DNA sequences only.) Can be a fixed positive value (ex:4.0) or e to get the maximum-likelihood estimate. """, equate=False, ), _Option(['-v', '--pinv', 'prop_invar'], """Proportion of invariable sites. Can be a fixed value in the range [0,1], or 'e' to get the maximum-likelihood estimate. 
""", equate=False, ), _Option(['-c', '--nclasses', 'nclasses'], """Number of relative substitution rate categories. Default 1. Must be a positive integer. """, equate=False, ), _Option(['-a', '--alpha', 'alpha'], """Distribution of the gamma distribution shape parameter. Can be a fixed positive value, or 'e' to get the maximum-likelihood estimate. """, equate=False, ), _Option(['-s', '--search', 'search'], """Tree topology search operation option. Can be one of: NNI : default, fast SPR : a bit slower than NNI BEST : best of NNI and SPR search """, checker_function=lambda x: x in ('NNI', 'SPR', 'BEST'), equate=False, ), # alt name: user_tree_file _Option(['-u', '--inputtree', 'input_tree'], "Starting tree filename. The tree must be in Newick format.", filename=True, equate=False, ), _Option(['-o', 'optimize'], """Specific parameter optimisation. tlr : tree topology (t), branch length (l) and rate parameters (r) are optimised. tl : tree topology and branch length are optimised. lr : branch length and rate parameters are optimised. l : branch length are optimised. r : rate parameters are optimised. n : no parameter is optimised. """, equate=False, ), _Switch(['--rand_start', 'rand_start'], """Sets the initial tree to random. Only valid if SPR searches are to be performed. """, ), _Option(['--n_rand_starts', 'n_rand_starts'], """Number of initial random trees to be used. Only valid if SPR searches are to be performed. """, equate=False, ), _Option(['--r_seed', 'r_seed'], """Seed used to initiate the random number generator. Must be an integer. """, equate=False, ), _Switch(['--print_site_lnl', 'print_site_lnl'], "Print the likelihood for each site in file *_phyml_lk.txt." ), _Switch(['--print_trace', 'print_trace'], """Print each phylogeny explored during the tree search process in file *_phyml_trace.txt.""" ), _Option(['--run_id', 'run_id'], """Append the given string at the end of each PhyML output file. This option may be useful when running simulations involving PhyML. """, checker_function=lambda x: isinstance(x, basestring), equate=False, ), # XXX should this always be set to True? _Switch(['--quiet', 'quiet'], "No interactive questions (for running in batch mode)." ), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="mafft", **kwargs): """Initialize the class.""" BLOSUM_MATRICES = ["30", "45", "62", "80"] self.parameters = [ # **** Algorithm **** # Automatically selects an appropriate strategy from L-INS-i, FFT-NS- # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) _Switch(["--auto", "auto"], "Automatically select strategy. Default off."), # Distance is calculated based on the number of shared 6mers. Default: on _Switch( ["--6merpair", "6merpair", "sixmerpair"], "Distance is calculated based on the number of shared " "6mers. Default: on", ), # All pairwise alignments are computed with the Needleman-Wunsch # algorithm. More accurate but slower than --6merpair. Suitable for a # set of globally alignable sequences. Applicable to up to ~200 # sequences. A combination with --maxiterate 1000 is recommended (G- # INS-i). Default: off (6mer distance is used) _Switch( ["--globalpair", "globalpair"], "All pairwise alignments are computed with the " "Needleman-Wunsch algorithm. Default: off", ), # All pairwise alignments are computed with the Smith-Waterman # algorithm. More accurate but slower than --6merpair. Suitable for a # set of locally alignable sequences. Applicable to up to ~200 # sequences. A combination with --maxiterate 1000 is recommended (L- # INS-i). Default: off (6mer distance is used) _Switch( ["--localpair", "localpair"], "All pairwise alignments are computed with the " "Smith-Waterman algorithm. Default: off", ), # All pairwise alignments are computed with a local algorithm with # the generalized affine gap cost (Altschul 1998). More accurate but # slower than --6merpair. Suitable when large internal gaps are # expected. Applicable to up to ~200 sequences. A combination with -- # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer # distance is used) _Switch( ["--genafpair", "genafpair"], "All pairwise alignments are computed with a local " "algorithm with the generalized affine gap cost " "(Altschul 1998). Default: off", ), # All pairwise alignments are computed with FASTA (Pearson and Lipman # 1988). FASTA is required. Default: off (6mer distance is used) _Switch( ["--fastapair", "fastapair"], "All pairwise alignments are computed with FASTA " "(Pearson and Lipman 1988). Default: off", ), # Weighting factor for the consistency term calculated from pairwise # alignments. Valid when either of --blobalpair, --localpair, -- # genafpair, --fastapair or --blastpair is selected. Default: 2.7 _Option( ["--weighti", "weighti"], "Weighting factor for the consistency term calculated " "from pairwise alignments. Default: 2.7", checker_function=lambda x: isinstance(x, float), equate=False, ), # Guide tree is built number times in the progressive stage. Valid # with 6mer distance. Default: 2 _Option( ["--retree", "retree"], "Guide tree is built number times in the progressive " "stage. Valid with 6mer distance. Default: 2", checker_function=lambda x: isinstance(x, int), equate=False, ), # Number cycles of iterative refinement are performed. Default: 0 _Option( ["--maxiterate", "maxiterate"], "Number cycles of iterative refinement are performed. Default: 0", checker_function=lambda x: isinstance(x, int), equate=False, ), # Number of threads to use. Default: 1 _Option( ["--thread", "thread"], "Number of threads to use. Default: 1", checker_function=lambda x: isinstance(x, int), equate=False, ), # Use FFT approximation in group-to-group alignment. Default: on _Switch( ["--fft", "fft"], "Use FFT approximation in group-to-group alignment. 
Default: on", ), # Do not use FFT approximation in group-to-group alignment. Default: # off _Switch( ["--nofft", "nofft"], "Do not use FFT approximation in group-to-group " "alignment. Default: off", ), # Alignment score is not checked in the iterative refinement stage. # Default: off (score is checked) _Switch( ["--noscore", "noscore"], "Alignment score is not checked in the iterative " "refinement stage. Default: off (score is checked)", ), # Use the Myers-Miller (1988) algorithm. Default: automatically # turned on when the alignment length exceeds 10,000 (aa/nt). _Switch( ["--memsave", "memsave"], "Use the Myers-Miller (1988) algorithm. Default: " "automatically turned on when the alignment length " "exceeds 10,000 (aa/nt).", ), # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with # the 6mer distance. Recommended for a large number (> ~10,000) of # sequences are input. Default: off _Switch( ["--parttree", "parttree"], "Use a fast tree-building method with the 6mer " "distance. Default: off", ), # The PartTree algorithm is used with distances based on DP. Slightly # more accurate and slower than --parttree. Recommended for a large # number (> ~10,000) of sequences are input. Default: off _Switch( ["--dpparttree", "dpparttree"], "The PartTree algorithm is used with distances " "based on DP. Default: off", ), # The PartTree algorithm is used with distances based on FASTA. # Slightly more accurate and slower than --parttree. Recommended for # a large number (> ~10,000) of sequences are input. FASTA is # required. Default: off _Switch( ["--fastaparttree", "fastaparttree"], "The PartTree algorithm is used with distances based " "on FASTA. Default: off", ), # The number of partitions in the PartTree algorithm. Default: 50 _Option( ["--partsize", "partsize"], "The number of partitions in the PartTree algorithm. Default: 50", checker_function=lambda x: isinstance(x, int), equate=False, ), # Do not make alignment larger than number sequences. Valid only with # the --*parttree options. Default: the number of input sequences _Switch( ["--groupsize", "groupsize"], "Do not make alignment larger than number sequences. " "Default: the number of input sequences", ), # Adjust direction according to the first sequence # Mafft V6 beta function _Switch( ["--adjustdirection", "adjustdirection"], "Adjust direction according to the first sequence. Default off.", ), # Adjust direction according to the first sequence # for highly diverged data; very slow # Mafft V6 beta function _Switch( ["--adjustdirectionaccurately", "adjustdirectionaccurately"], "Adjust direction according to the first sequence," "for highly diverged data; very slow" "Default off.", ), # **** Parameter **** # Gap opening penalty at group-to-group alignment. Default: 1.53 _Option( ["--op", "op"], "Gap opening penalty at group-to-group alignment. Default: 1.53", checker_function=lambda x: isinstance(x, float), equate=False, ), # Offset value, which works like gap extension penalty, for group-to- # group alignment. Default: 0.123 _Option( ["--ep", "ep"], "Offset value, which works like gap extension penalty, " "for group-to- group alignment. Default: 0.123", checker_function=lambda x: isinstance(x, float), equate=False, ), # Gap opening penalty at local pairwise alignment. Valid when the -- # localpair or --genafpair option is selected. Default: -2.00 _Option( ["--lop", "lop"], "Gap opening penalty at local pairwise alignment. 
Default: 0.123", checker_function=lambda x: isinstance(x, float), equate=False, ), # Offset value at local pairwise alignment. Valid when the -- # localpair or --genafpair option is selected. Default: 0.1 _Option( ["--lep", "lep"], "Offset value at local pairwise alignment. Default: 0.1", checker_function=lambda x: isinstance(x, float), equate=False, ), # Gap extension penalty at local pairwise alignment. Valid when the - # -localpair or --genafpair option is selected. Default: -0.1 _Option( ["--lexp", "lexp"], "Gap extension penalty at local pairwise alignment. Default: -0.1", checker_function=lambda x: isinstance(x, float), equate=False, ), # Gap opening penalty to skip the alignment. Valid when the -- # genafpair option is selected. Default: -6.00 _Option( ["--LOP", "LOP"], "Gap opening penalty to skip the alignment. Default: -6.00", checker_function=lambda x: isinstance(x, float), equate=False, ), # Gap extension penalty to skip the alignment. Valid when the -- # genafpair option is selected. Default: 0.00 _Option( ["--LEXP", "LEXP"], "Gap extension penalty to skip the alignment. Default: 0.00", checker_function=lambda x: isinstance(x, float), equate=False, ), # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. # number=30, 45, 62 or 80. Default: 62 _Option( ["--bl", "bl"], "BLOSUM number matrix is used. Default: 62", checker_function=lambda x: x in BLOSUM_MATRICES, equate=False, ), # JTT PAM number (Jones et al. 1992) matrix is used. number>0. # Default: BLOSUM62 _Option( ["--jtt", "jtt"], "JTT PAM number (Jones et al. 1992) matrix is used. " "number>0. Default: BLOSUM62", equate=False, ), # Transmembrane PAM number (Jones et al. 1994) matrix is used. # number>0. Default: BLOSUM62 _Option( ["--tm", "tm"], "Transmembrane PAM number (Jones et al. 1994) " "matrix is used. number>0. Default: BLOSUM62", filename=True, # to ensure spaced inputs are quoted equate=False, ), # Use a user-defined AA scoring matrix. The format of matrixfile is # the same to that of BLAST. Ignored when nucleotide sequences are # input. Default: BLOSUM62 _Option( ["--aamatrix", "aamatrix"], "Use a user-defined AA scoring matrix. Default: BLOSUM62", filename=True, # to ensure spaced inputs are quoted equate=False, ), # Incorporate the AA/nuc composition information into the scoring # matrix. Default: off _Switch( ["--fmodel", "fmodel"], "Incorporate the AA/nuc composition information into " "the scoring matrix (True) or not (False, default)", ), # **** Output **** # Name length for CLUSTAL and PHYLIP format output _Option( ["--namelength", "namelength"], """Name length in CLUSTAL and PHYLIP output. MAFFT v6.847 (2011) added --namelength for use with the --clustalout option for CLUSTAL output. MAFFT v7.024 (2013) added support for this with the --phylipout option for PHYLIP output (default 10). """, checker_function=lambda x: isinstance(x, int), equate=False, ), # Output format: clustal format. Default: off (fasta format) _Switch( ["--clustalout", "clustalout"], "Output format: clustal (True) or fasta (False, default)", ), # Output format: phylip format. # Added in beta with v6.847, fixed in v6.850 (2011) _Switch( ["--phylipout", "phylipout"], "Output format: phylip (True), or fasta (False, default)", ), # Output order: same as input. Default: on _Switch( ["--inputorder", "inputorder"], "Output order: same as input (True, default) or alignment " "based (False)", ), # Output order: aligned. 
Default: off (inputorder) _Switch( ["--reorder", "reorder"], "Output order: aligned (True) or in input order (False, default)", ), # Guide tree is output to the input.tree file. Default: off _Switch( ["--treeout", "treeout"], "Guide tree is output to the input.tree file (True) or " "not (False, default)", ), # Do not report progress. Default: off _Switch( ["--quiet", "quiet"], "Do not report progress (True) or not (False, default).", ), # **** Input **** # Assume the sequences are nucleotide. Default: auto _Switch( ["--nuc", "nuc"], "Assume the sequences are nucleotide (True/False). Default: auto", ), # Assume the sequences are amino acid. Default: auto _Switch( ["--amino", "amino"], "Assume the sequences are amino acid (True/False). Default: auto", ), # MAFFT has multiple --seed commands where the unaligned input is # aligned to the seed alignment. There can be multiple seeds in the # form: "mafft --seed align1 --seed align2 [etc] input" # Effectively for n number of seed alignments. # TODO - Can we use class _ArgumentList here? _Option( ["--seed", "seed"], "Seed alignments given in alignment_n (fasta format) " "are aligned with sequences in input.", filename=True, equate=False, ), _Option( ["--add", "add"], "(Unofficial interface for --add) New sequences to add " "to existing alignment.", filename=True, equate=False, ), # The input (must be FASTA format) _Argument(["input"], "Input file name", filename=True, is_required=True), # mafft-profile takes a second alignment input as an argument: # mafft-profile align1 align2 _Argument( ["input1"], "Second input file name for the mafft-profile command", filename=True, ), ] AbstractCommandline.__init__(self, cmd, **kwargs)
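# Usage sketch (illustration only): assuming the __init__ above belongs to
# MafftCommandline as in Bio.Align.Applications. MAFFT writes the alignment to
# stdout, so the captured stdout is saved to a file; file names are placeholders.
from Bio.Align.Applications import MafftCommandline

mafft_cline = MafftCommandline(input="unaligned.fasta", auto=True, thread=2)
print(mafft_cline)              # roughly: mafft --auto --thread 2 unaligned.fasta
stdout, stderr = mafft_cline()  # requires mafft on PATH
with open("aligned.fasta", "w") as handle:
    handle.write(stdout)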
def __init__(self, cmd="phyml", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-i", "--input", "input"], "PHYLIP format input nucleotide or amino-acid sequence filenam.", filename=True, is_required=True, equate=False, ), _Option( ["-d", "--datatype", "datatype"], "Datatype 'nt' for nucleotide (default) or 'aa' for amino-acids.", checker_function=lambda x: x in ("nt", "aa"), equate=False, ), _Switch( ["-q", "--sequential", "sequential"], "Changes interleaved format (default) to sequential format.", ), _Option( ["-n", "--multiple", "multiple"], "Number of data sets to analyse (integer).", checker_function=(lambda x: isinstance(x, int) or x.isdigit()), equate=False, ), _Switch( ["-p", "--pars", "pars"], """Use a minimum parsimony starting tree. This option is taken into account when the '-u' option is absent and when tree topology modifications are to be done. """, ), _Option( ["-b", "--bootstrap", "bootstrap"], r"""Number of bootstrap replicates, if value is > 0. Otherwise: 0: neither approximate likelihood ratio test nor bootstrap values are computed. -1: approximate likelihood ratio test returning aLRT statistics. -2: approximate likelihood ratio test returning Chi2-based parametric branch supports. -4: SH-like branch supports alone. """, equate=False, ), _Option( ["-m", "--model", "model"], """Substitution model name. Nucleotide-based models: HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR | custom For the custom option, a string of six digits identifies the model. For instance, 000000 corresponds to F81 (or JC69, provided the distribution of nucleotide frequencies is uniform). 012345 corresponds to GTR. This option can be used for encoding any model that is a nested within GTR. Amino-acid based models: LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV | CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb | custom """, checker_function=( lambda x: x in ( # Nucleotide models: "HKY85", "JC69", "K80", "F81", "F84", "TN93", "GTR", # Amino acid models: "LG", "WAG", "JTT", "MtREV", "Dayhoff", "DCMut", "RtREV", "CpREV", "VT", "Blosum62", "MtMam", "MtArt", "HIVw", "HIVb", ) or isinstance(x, int)), equate=False, ), _Option( ["-f", "frequencies"], """Character frequencies. -f e, m, or "fA fC fG fT" e : Empirical frequencies, determined as follows : - Nucleotide sequences: (Empirical) the equilibrium base frequencies are estimated by counting the occurrence of the different bases in the alignment. - Amino-acid sequences: (Empirical) the equilibrium amino-acid frequencies are estimated by counting the occurrence of the different amino-acids in the alignment. m : ML/model-based frequencies, determined as follows : - Nucleotide sequences: (ML) the equilibrium base frequencies are estimated using maximum likelihood - Amino-acid sequences: (Model) the equilibrium amino-acid frequencies are estimated using the frequencies defined by the substitution model. "fA fC fG fT" : only valid for nucleotide-based models. fA, fC, fG and fT are floating-point numbers that correspond to the frequencies of A, C, G and T, respectively. """, filename=True, # ensure ".25 .25 .25 .25" stays quoted equate=False, ), _Option( ["-t", "--ts/tv", "ts_tv_ratio"], """Transition/transversion ratio. (DNA sequences only.) Can be a fixed positive value (ex:4.0) or e to get the maximum-likelihood estimate. """, equate=False, ), _Option( ["-v", "--pinv", "prop_invar"], """Proportion of invariable sites. Can be a fixed value in the range [0,1], or 'e' to get the maximum-likelihood estimate. 
""", equate=False, ), _Option( ["-c", "--nclasses", "nclasses"], """Number of relative substitution rate categories. Default 1. Must be a positive integer. """, equate=False, ), _Option( ["-a", "--alpha", "alpha"], """Distribution of the gamma distribution shape parameter. Can be a fixed positive value, or 'e' to get the maximum-likelihood estimate. """, equate=False, ), _Option( ["-s", "--search", "search"], """Tree topology search operation option. Can be one of: NNI : default, fast SPR : a bit slower than NNI BEST : best of NNI and SPR search """, checker_function=lambda x: x in ("NNI", "SPR", "BEST"), equate=False, ), # alt name: user_tree_file _Option( ["-u", "--inputtree", "input_tree"], "Starting tree filename. The tree must be in Newick format.", filename=True, equate=False, ), _Option( ["-o", "optimize"], r"""Specific parameter optimisation. tlr : tree topology (t), branch length (l) and rate parameters (r) are optimised. tl : tree topology and branch length are optimised. lr : branch length and rate parameters are optimised. l : branch length are optimised. r : rate parameters are optimised. n : no parameter is optimised. """, equate=False, ), _Switch( ["--rand_start", "rand_start"], """Sets the initial tree to random. Only valid if SPR searches are to be performed. """, ), _Option( ["--n_rand_starts", "n_rand_starts"], """Number of initial random trees to be used. Only valid if SPR searches are to be performed. """, equate=False, ), _Option( ["--r_seed", "r_seed"], """Seed used to initiate the random number generator. Must be an integer. """, equate=False, ), _Switch( ["--print_site_lnl", "print_site_lnl"], r"Print the likelihood for each site in file \*_phyml_lk.txt.", ), _Switch( ["--print_trace", "print_trace"], r""" Print each phylogeny explored during the tree search process in file \*_phyml_trace.txt.""", ), _Option( ["--run_id", "run_id"], """Append the given string at the end of each PhyML output file. This option may be useful when running simulations involving PhyML. """, checker_function=lambda x: isinstance(x, basestring), equate=False, ), # XXX should this always be set to True? _Switch( ["--quiet", "quiet"], "No interactive questions (for running in batch mode).", ), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="fdnadist", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "seq file to use (phylip)", filename=True, is_required=True, ), _Option(["-method", "method"], "sub. model [f,k,j,l,s]", is_required=True), _Option(["-gamma", "gamma"], "gamma [g, i,n]"), _Option( ["-ncategories", "ncategories"], "number of rate catergories (1-9)" ), _Option(["-rate", "rate"], "rate for each category"), _Option( ["-categories", "categories"], "File of substitution rate categories" ), _Option(["-weights", "weights"], "weights file"), _Option( ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" ), _Option(["-invarfrac", "invarfrac"], "proportoin of invariant sites"), _Option(["-ttratio", "ttratio"], "ts/tv ratio"), _Option(["-freqsfrom", "freqsfrom"], "use emprical base freqs"), _Option(["-basefreq", "basefreq"], "specify basefreqs"), _Option(["-lower", "lower"], "lower triangle matrix (y/N)"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="eprimer3", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "Sequence to choose primers from.", is_required=True, ), _Option(["-task", "task"], "Tell eprimer3 what task to perform."), _Option( ["-hybridprobe", "hybridprobe"], "Find an internal oligo to use as a hyb probe.", ), _Option( ["-numreturn", "numreturn"], "Maximum number of primer pairs to return." ), _Option( ["-includedregion", "includedregion"], "Subregion of the sequence in which to pick primers.", ), _Option(["-target", "target"], "Sequence to target for flanking primers."), _Option( ["-excludedregion", "excludedregion"], "Regions to exclude from primer picking.", ), _Option( ["-forwardinput", "forwardinput"], "Sequence of a forward primer to check.", ), _Option( ["-reverseinput", "reverseinput"], "Sequence of a reverse primer to check.", ), _Option( ["-gcclamp", "gcclamp"], "The required number of Gs and Cs at the 3' of each primer.", ), _Option(["-osize", "osize"], "Optimum length of a primer oligo."), _Option(["-minsize", "minsize"], "Minimum length of a primer oligo."), _Option(["-maxsize", "maxsize"], "Maximum length of a primer oligo."), _Option( ["-otm", "otm"], "Melting temperature for primer oligo (OBSOLETE).\n\n" "Option replaced in EMBOSS 6.6.0 by -opttm", ), _Option( ["-opttm", "opttm"], "Optimum melting temperature for a primer oligo.\n\n" "Option added in EMBOSS 6.6.0, replacing -otm", ), _Option( ["-mintm", "mintm"], "Minimum melting temperature for a primer oligo." ), _Option( ["-maxtm", "maxtm"], "Maximum melting temperature for a primer oligo." ), _Option( ["-maxdifftm", "maxdifftm"], "Maximum difference in melting temperatures between " "forward and reverse primers.", ), _Option(["-ogcpercent", "ogcpercent"], "Optimum GC% for a primer."), _Option(["-mingc", "mingc"], "Minimum GC% for a primer."), _Option(["-maxgc", "maxgc"], "Maximum GC% for a primer."), _Option( ["-saltconc", "saltconc"], "Millimolar salt concentration in the PCR." ), _Option( ["-dnaconc", "dnaconc"], "Nanomolar concentration of annealing oligos in the PCR.", ), _Option( ["-maxpolyx", "maxpolyx"], "Maximum allowable mononucleotide repeat length in a primer.", ), # Primer length: _Option(["-psizeopt", "psizeopt"], "Optimum size for the PCR product."), _Option( ["-prange", "prange"], "Acceptable range of length for the PCR product." ), # Primer temperature: _Option( ["-ptmopt", "ptmopt"], "Optimum melting temperature for the PCR product.", ), _Option( ["-ptmmin", "ptmmin"], "Minimum allowed melting temperature for the amplicon.", ), _Option( ["-ptmmax", "ptmmax"], "Maximum allowed melting temperature for the amplicon.", ), # Note to self, should be -oexcludedregion not -oexcluderegion _Option( ["-oexcludedregion", "oexcludedregion"], "Do not pick internal oligos in this region.", ), _Option(["-oligoinput", "oligoinput"], "Sequence of the internal oligo."), # Oligo length: _Option(["-osizeopt", "osizeopt"], "Optimum length of internal oligo."), _Option(["-ominsize", "ominsize"], "Minimum length of internal oligo."), _Option(["-omaxsize", "omaxsize"], "Maximum length of internal oligo."), # Oligo GC temperature: _Option( ["-otmopt", "otmopt"], "Optimum melting temperature of internal oligo." ), _Option( ["-otmmin", "otmmin"], "Minimum melting temperature of internal oligo." ), _Option( ["-otmmax", "otmmax"], "Maximum melting temperature of internal oligo." 
), # Oligo GC percent: _Option(["-ogcopt", "ogcopt"], "Optimum GC% for internal oligo."), _Option(["-ogcmin", "ogcmin"], "Minimum GC% for internal oligo."), _Option(["-ogcmax", "ogcmax"], "Maximum GC% for internal oligo."), # Oligo salt concentration: _Option( ["-osaltconc", "osaltconc"], "Millimolar concentration of salt in the hybridisation.", ), _Option( ["-odnaconc", "odnaconc"], "Nanomolar concentration of internal oligo in the hybridisation.", ), # Oligo self complementarity _Option( ["-oanyself", "oanyself"], "Maximum allowable alignment score for self-complementarity.", ), _Option( ["-oendself", "oendself"], "Max 3'-anchored self-complementarity global alignment score.", ), _Option( ["-opolyxmax", "opolyxmax"], "Maximum length of mononucleotide repeat in internal oligo.", ), _Option( ["-mispriminglibraryfile", "mispriminglibraryfile"], "File containing library of sequences to avoid amplifying", ), _Option( ["-maxmispriming", "maxmispriming"], "Maximum allowed similarity of primers to sequences in " "library specified by -mispriminglibrary", ), _Option( ["-omishybmax", "omishybmax"], "Maximum alignment score for hybridisation of internal oligo to " "library specified by -mishyblibraryfile.", ), _Option( ["-mishyblibraryfile", "mishyblibraryfile"], "Library file of seqs to avoid internal oligo hybridisation.", ), _Option( ["-explainflag", "explainflag"], "Produce output tags with eprimer3 statistics", ), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
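# Usage sketch, not part of the original source: assuming this is the
# Primer3Commandline wrapper for EMBOSS eprimer3 (Bio.Emboss.Applications);
# -outfile and -auto are assumed to be inherited from _EmbossCommandLine.
#
#     >>> from Bio.Emboss.Applications import Primer3Commandline
#     >>> cline = Primer3Commandline(sequence="mysequence.fasta", auto=True)
#     >>> cline.explainflag = True
#     >>> cline.psizeopt = 200
#     >>> cline.outfile = "myresults.out"
#     >>> stdout, stderr = cline()  # needs eprimer3 plus a primer3_core binary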
def __init__(self, cmd="clustalw", **kwargs): """Initialize the class.""" self.parameters = [ _Option(["-infile", "-INFILE", "INFILE", "infile"], "Input sequences.", filename=True), _Option(["-profile1", "-PROFILE1", "PROFILE1", "profile1"], "Profiles (old alignment).", filename=True), _Option(["-profile2", "-PROFILE2", "PROFILE2", "profile2"], "Profiles (old alignment).", filename=True), # ################# VERBS (do things) ############################# _Switch(["-options", "-OPTIONS", "OPTIONS", "options"], "List the command line parameters"), _Switch(["-help", "-HELP", "HELP", "help"], "Outline the command line params."), _Switch(["-check", "-CHECK", "CHECK", "check"], "Outline the command line params."), _Switch(["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"], "Output full help content."), _Switch(["-align", "-ALIGN", "ALIGN", "align"], "Do full multiple alignment."), _Switch(["-tree", "-TREE", "TREE", "tree"], "Calculate NJ tree."), _Switch([ "-pim", "-PIM", "PIM", "pim" ], "Output percent identity matrix (while calculating the tree)."), _Option( ["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"], "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).", checker_function=lambda x: isinstance(x, int)), _Switch(["-convert", "-CONVERT", "CONVERT", "convert"], "Output the input sequences in a different file format."), # #################### PARAMETERS (set things) ######################### # ***General settings:**** # Makes no sense in biopython # _Option(["-interactive", "-INTERACTIVE", "INTERACTIVE", "interactive"], # [], # lambda x: 0, # Does not take value # False, # "read command line, then enter normal interactive menus", # False), _Switch(["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"], "Use FAST algorithm for the alignment guide tree"), _Option(["-type", "-TYPE", "TYPE", "type"], "PROTEIN or DNA sequences", checker_function=lambda x: x in ["PROTEIN", "DNA", "protein", "dna"]), _Switch(["-negative", "-NEGATIVE", "NEGATIVE", "negative"], "Protein alignment with negative values in matrix"), _Option(["-outfile", "-OUTFILE", "OUTFILE", "outfile"], "Output sequence alignment file name", filename=True), _Option( ["-output", "-OUTPUT", "OUTPUT", "output"], "Output format: CLUSTAL(default), GCG, GDE, PHYLIP, PIR, NEXUS and FASTA", checker_function=lambda x: x in [ "CLUSTAL", "GCG", "GDE", "PHYLIP", "PIR", "NEXUS", "FASTA", "clustal", "gcg", "gde", "phylip", "pir", "nexus", "fasta" ]), _Option(["-outorder", "-OUTORDER", "OUTORDER", "outorder"], "Output taxon order: INPUT or ALIGNED", checker_function=lambda x: x in ["INPUT", "input", "ALIGNED", "aligned"]), _Option(["-case", "-CASE", "CASE", "case"], "LOWER or UPPER (for GDE output only)", checker_function=lambda x: x in ["UPPER", "upper", "LOWER", "lower"]), _Option( ["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"], "OFF or ON (for Clustal output only)", checker_function=lambda x: x in ["ON", "on", "OFF", "off"]), _Option( ["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"], "OFF or ON (NEW- for all output formats)", checker_function=lambda x: x in ["ON", "on", "OFF", "off"]), _Option(["-range", "-RANGE", "RANGE", "range"], "Sequence range to write starting m to m+n. " "Input as string eg. 
'24,200'"), _Option(["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"], "Maximum allowed input sequence length", checker_function=lambda x: isinstance(x, int)), _Switch(["-quiet", "-QUIET", "QUIET", "quiet"], "Reduce console output to minimum"), _Option(["-stats", "-STATS", "STATS", "stats"], "Log some alignment statistics to file", filename=True), # ***Fast Pairwise Alignments:*** _Option(["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"], "Word size", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"], "Number of best diags.", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-window", "-WINDOW", "WINDOW", "window"], "Window around best diags.", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"], "Gap penalty", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-score", "-SCORE", "SCORE", "score"], "Either: PERCENT or ABSOLUTE", checker_function=lambda x: x in ["percent", "PERCENT", "absolute", "ABSOLUTE"]), # ***Slow Pairwise Alignments:*** _Option( ["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"], "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", checker_function=lambda x: (x in [ "BLOSUM", "PAM", "GONNET", "ID", "blosum", "pam", "gonnet", "id" ] or os.path.exists(x)), filename=True), _Option( ["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"], "DNA weight matrix=IUB, CLUSTALW or filename", checker_function=lambda x: (x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path.exists( x)), filename=True), _Option(["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"], "Gap opening penalty", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"], "Gap extension penalty", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), # ***Multiple Alignments:*** _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"], "Output file name for newly created guide tree", filename=True), _Option(["-usetree", "-USETREE", "USETREE", "usetree"], "File name of guide tree", checker_function=lambda x: os.path.exists, filename=True), _Option( ["-matrix", "-MATRIX", "MATRIX", "matrix"], "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename", checker_function=lambda x: (x in [ "BLOSUM", "PAM", "GONNET", "ID", "blosum", "pam", "gonnet", "id" ] or os.path.exists(x)), filename=True), _Option(["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"], "DNA weight matrix=IUB, CLUSTALW or filename", checker_function=lambda x: (x in ["IUB", "CLUSTALW", "iub", "clustalw"] or os.path. exists(x)), filename=True), _Option(["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"], "Gap opening penalty", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-gapext", "-GAPEXT", "GAPEXT", "gapext"], "Gap extension penalty", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Switch(["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"], "No end gap separation pen."), _Option(["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"], "Gap separation pen. 
range", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Switch(["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"], "Residue-specific gaps off"), _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"], "Hydrophilic gaps off"), _Switch([ "-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues" ], "List hydrophilic res."), _Option(["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"], "% ident. for delay", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), # Already handled in General Settings section, but appears a second # time under Multiple Alignments in the help # _Option(["-type", "-TYPE", "TYPE", "type"], # "PROTEIN or DNA", # checker_function=lambda x: x in ["PROTEIN", "DNA", # "protein", "dna"]), _Option( ["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"], "Transitions weighting", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option( ["-iteration", "-ITERATION", "ITERATION", "iteration"], "NONE or TREE or ALIGNMENT", checker_function=lambda x: x in ["NONE", "TREE", "ALIGNMENT", "none", "tree", "alignment"]), _Option(["-numiter", "-NUMITER", "NUMITER", "numiter"], "maximum number of iterations to perform", checker_function=lambda x: isinstance(x, int)), _Switch(["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"], "Disable sequence weighting"), # ***Profile Alignments:*** _Switch(["-profile", "-PROFILE", "PROFILE", "profile"], "Merge two alignments by profile alignment"), _Option(["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"], "Output file name for new guide tree of profile1", filename=True), _Option(["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"], "Output file for new guide tree of profile2", filename=True), _Option(["-usetree1", "-USETREE1", "USETREE1", "usetree1"], "File name of guide tree for profile1", checker_function=lambda x: os.path.exists, filename=True), _Option(["-usetree2", "-USETREE2", "USETREE2", "usetree2"], "File name of guide tree for profile2", checker_function=lambda x: os.path.exists, filename=True), # ***Sequence to Profile Alignments:*** _Switch( ["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"], "Sequentially add profile2 sequences to profile1 alignment"), # These are already handled in the Multiple Alignments section, # but appear a second time here in the help. 
# _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"], # "File for new guide tree", # filename=True), # _Option(["-usetree", "-USETREE", "USETREE", "usetree"], # "File for old guide tree", # checker_function=lambda x: os.path.exists, # filename=True), # ***Structure Alignments:*** _Switch( ["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"], "Do not use secondary structure-gap penalty mask for profile 1" ), _Switch( ["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"], "Do not use secondary structure-gap penalty mask for profile 2" ), _Option( ["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"], "STRUCTURE or MASK or BOTH or NONE output in alignment file", checker_function=lambda x: x in [ "STRUCTURE", "MASK", "BOTH", "NONE", "structure", "mask", "both", "none" ]), _Option(["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"], "Gap penalty for helix core residues", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"], "gap penalty for strand core residues", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option(["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"], "Gap penalty for loop regions", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option( ["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"], "Gap penalty for structure termini", checker_function=lambda x: (isinstance(x, int) or isinstance(x, float))), _Option( ["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"], "Number of residues inside helix to be treated as terminal", checker_function=lambda x: isinstance(x, int)), _Option( ["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"], "Number of residues outside helix to be treated as terminal", checker_function=lambda x: isinstance(x, int)), _Option( ["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"], "Number of residues inside strand to be treated as terminal", checker_function=lambda x: isinstance(x, int)), _Option( [ "-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout" ], "Number of residues outside strand to be treated as terminal", checker_function=lambda x: isinstance(x, int)), # ***Trees:*** _Option(["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"], "nj OR phylip OR dist OR nexus", checker_function=lambda x: x in [ "NJ", "PHYLIP", "DIST", "NEXUS", "nj", "phylip", "dist", "nexus" ]), _Option(["-seed", "-SEED", "SEED", "seed"], "Seed number for bootstraps.", checker_function=lambda x: isinstance(x, int)), _Switch(["-kimura", "-KIMURA", "KIMURA", "kimura"], "Use Kimura's correction."), _Switch(["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"], "Ignore positions with gaps."), _Option( ["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"], "Node OR branch position of bootstrap values in tree display", checker_function=lambda x: x in ["NODE", "BRANCH", "node", "branch"]), _Option( ["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"], "NJ or UPGMA", checker_function=lambda x: x in ["NJ", "UPGMA", "nj", "upgma"]) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="raxmlHPC", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-a", "weight_filename"], "Name of a column weight file to assign individual weights " "to each column of the alignment. Those weights must be " "integers separated by any type and number of whitespaces " "within a separate file.", filename=True, equate=False, ), _Option(["-b", "bootstrap_seed"], "Random seed for bootstrapping.", equate=False), _Option( ["-c", "num_categories"], "Number of distinct rate categories for RAxML when " "evolution model is set to GTRCAT or GTRMIX." "Individual per-site rates are categorized into this " "many rate categories to accelerate computations. " "Default: 25.", equate=False, ), _Switch( ["-d", "random_starting_tree"], "Start ML optimization from random starting tree.", ), _Option( ["-e", "epsilon"], "Set model optimization precision in log likelihood units " "for final optimization of tree topology under MIX/MIXI " "or GAMMA/GAMMAI." "Default: 0.1 for models not using proportion of " "invariant sites estimate; 0.001 for models using " "proportion of invariant sites estimate.", equate=False, ), _Option( ["-E", "exclude_filename"], "An exclude file name, containing a specification of " "alignment positions you wish to exclude. Format is " "similar to Nexus, the file shall contain entries like " "'100-200 300-400'; to exclude a single column write, " "e.g., '100-100'. If you use a mixed model, an " "appropriately adapted model file will be written.", filename=True, equate=False, ), _Option( ["-f", "algorithm"], r""" Select algorithm: a: Rapid Bootstrap analysis and search for best-scoring ML tree in one program run. b: Draw bipartition information on a tree provided with '-t' based on multiple trees (e.g. form a bootstrap) in a file specifed by '-z'. c: Check if the alignment can be properly read by RAxML. d: New rapid hill-climbing (DEFAULT). e: Optimize model+branch lengths for given input tree under GAMMA/GAMMAI only. g: Compute per site log Likelihoods for one ore more trees passed via '-z' and write them to a file that can be read by CONSEL. h: Compute log likelihood test (SH-test) between best tree passed via '-t' and a bunch of other trees passed via '-z'. i: Perform a really thorough bootstrap, refinement of final bootstrap tree under GAMMA and a more exhaustive algorithm. j: Generate a bunch of bootstrapped alignment files from an original alignemnt file. m: Compare bipartitions between two bunches of trees passed via '-t' and '-z' respectively. This will return the Pearson correlation between all bipartitions found in the two tree files. A file called RAxML_bipartitionFrequencies.outputFileName will be printed that contains the pair-wise bipartition frequencies of the two sets. n: Compute the log likelihood score of all trees contained in a tree file provided by '-z' under GAMMA or GAMMA+P-Invar. o: Old and slower rapid hill-climbing. p: Perform pure stepwise MP addition of new sequences to an incomplete starting tree. s: Split up a multi-gene partitioned alignment into the respective subalignments. t: Do randomized tree searches on one fixed starting tree. w: Compute ELW test on a bunch of trees passed via '-z'. x: Compute pair-wise ML distances, ML model parameters will be estimated on an MP starting tree or a user-defined tree passed via '-t', only allowed for GAMMA-based models of rate heterogeneity. 
""", checker_function=( lambda x: isinstance(x, str) and len(x) == 1), equate=False, ), _Option( ["-g", "grouping_constraint"], "File name of a multifurcating constraint tree. " "this tree does not need to be comprehensive, i.e. " "contain all taxa.", filename=True, equate=False, ), _Option( ["-i", "rearrangements"], "Initial rearrangement setting for the subsequent " "application of topological changes phase.", equate=False, ), _Switch( ["-j", "checkpoints"], "Write checkpoints (intermediate tree topologies).", ), _Switch( ["-k", "bootstrap_branch_lengths"], "Print bootstrapped trees with branch lengths. " "The bootstraps will run a bit longer, because model " "parameters will be optimized at the end of each run. " "Use with CATMIX/PROTMIX or GAMMA/GAMMAI.", ), _Option( ["-l", "cluster_threshold"], "Threshold for sequence similarity clustering. " "RAxML will then print out an alignment to a file " "called sequenceFileName.reducedBy.threshold that " "only contains sequences <= the specified threshold " "that must be between 0.0 and 1.0. RAxML uses the " "QT-clustering algorithm to perform this task. " "In addition, a file called " "RAxML_reducedList.outputFileName will be written " "that contains clustering information.", equate=False, ), _Option( ["-L", "cluster_threshold_fast"], "Same functionality as '-l', but uses a less " "exhaustive and thus faster clustering algorithm. " "This is intended for very large datasets with more " "than 20,000-30,000 sequences.", equate=False, ), _Option( ["-m", "model"], r"""Model of Nucleotide or Amino Acid Substitution: NUCLEOTIDES: GTRCAT : GTR + Optimization of substitution rates + Optimization of site-specific evolutionary rates which are categorized into numberOfCategories distinct rate categories for greater computational efficiency if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program will use GTRMIX instead GTRGAMMA : GTR + Optimization of substitution rates + GAMMA model of rate heterogeneity (alpha parameter will be estimated) GTRMIX : Inference of the tree under GTRCAT and thereafter evaluation of the final tree topology under GTRGAMMA GTRCAT_GAMMA : Inference of the tree with site-specific evolutionary rates. However, here rates are categorized using the 4 discrete GAMMA rates. Evaluation of the final tree topology under GTRGAMMA GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of invariable sites GTRMIXI : Same as GTRMIX, but with estimate of proportion of invariable sites GTRCAT_GAMMAI : Same as GTRCAT_GAMMA, but with estimate of proportion of invariable sites AMINO ACIDS: PROTCATmatrixName[F] : specified AA matrix + Optimization of substitution rates + Optimization of site-specific evolutionary rates which are categorized into numberOfCategories distinct rate categories for greater computational efficiency if you do a multiple analysis with '-#' or '-N' but without bootstrapping the program will use PROTMIX... instead PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of substitution rates + GAMMA model of rate heterogeneity (alpha parameter will be estimated) PROTMIXmatrixName[F] : Inference of the tree under specified AA matrix + CAT and thereafter evaluation of the final tree topology under specified AA matrix + GAMMA PROTCAT_GAMMAmatrixName[F] : Inference of the tree under specified AA matrix and site-specific evolutionary rates. However, here rates are categorized using the 4 discrete GAMMA rates. 
Evaluation of the final tree topology under specified AA matrix + GAMMA PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but with estimate of proportion of invariable sites PROTMIXImatrixName[F] : Same as PROTMIXmatrixName[F], but with estimate of proportion of invariable sites PROTCAT_GAMMAImatrixName[F] : Same as PROTCAT_GAMMAmatrixName[F], but with estimate of proportion of invariable sites Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV, CPREV, VT, BLOSUM62, MTMAM, GTR With the optional 'F' appendix you can specify if you want to use empirical base frequencies Please not that for mixed models you can in addition specify the per-gene AA model in the mixed model file (see manual for details) """, equate=False, ), _Switch( ["-M", "partition_branch_lengths"], "Switch on estimation of individual per-partition " "branch lengths. Only has effect when used in " "combination with 'partition_filename' ('-q'). " "Branch lengths for individual partitions will be " "printed to separate files. A weighted average of the " "branch lengths is computed by using the respective " "partition lengths. ", ), _Option( ["-n", "name"], "Name used in the output files.", filename=True, equate=False, ), _Option( ["-o", "outgroup"], "Name of a single outgroup or a comma-separated list " "of outgroups, eg '-o Rat' or '-o Rat,Mouse'. In case " "that multiple outgroups are not monophyletic the " "first name in the list will be selected as outgroup. " "Don't leave spaces between taxon names!", checker_function=lambda x: len(x.split()) == 1, equate=False, ), _Option( ["-q", "partition_filename"], "File name containing the assignment of models to " "alignment partitions for multiple models of " "substitution. For the syntax of this file please " "consult the RAxML manual.", filename=True, equate=False, ), _Option( ["-p", "parsimony_seed"], "Random number seed for the parsimony inferences. " "This allows you to reproduce your results and will " "help developers debug the program. This option HAS " "NO EFFECT in the parallel MPI version.", equate=False, ), _Option( ["-P", "protein_model"], "File name of a user-defined AA (Protein) substitution " "model. This file must contain 420 entries, the first " "400 being the AA substitution rates (this must be a " "symmetric matrix) and the last 20 are the empirical " "base frequencies.", filename=True, equate=False, ), _Option( ["-r", "binary_constraint"], "File name of a binary constraint tree. " "This tree does not need to be comprehensive, i.e. " "contain all taxa.", filename=True, equate=False, ), _Option( ["-s", "sequences"], "Name of the alignment data file, in PHYLIP format.", filename=True, equate=False, ), _Option( ["-t", "starting_tree"], "File name of a user starting tree, in Newick format.", filename=True, equate=False, ), _Option( ["-T", "threads"], "Number of threads to run. " "PTHREADS VERSION ONLY! " "Make sure to set this at most the number of CPUs " "you have on your machine, otherwise, there will be " "a huge performance decrease!", equate=False, ), _Option( ["-u", "num_bootstrap_searches"], "Number of multiple bootstrap searches per replicate. " "Use this to obtain better ML trees for each " "replicate. Default: 1 ML search per bootstrap " "replicate.", equate=False, ), _Switch(["-v", "version"], "Display version information."), _Option( ["-w", "working_dir"], "Name of the working directory where RAxML will " "write its output files. 
Default: current directory.", filename=True, equate=False, ), _Option( ["-x", "rapid_bootstrap_seed"], "Random seed for rapid bootstrapping.", equate=False, ), _Switch( ["-y", "parsimony"], "Only compute a parsimony starting tree, then exit.", ), _Option( ["-z", "bipartition_filename"], "Name of a file containing multiple trees, e.g. from " "a bootstrap run, that shall be used to draw " "bipartition values onto a tree provided with '-t'. " "It can also be used to compute per-site log " "likelihoods in combination with '-f g', and to read " "a bunch of trees for a couple of other options " "('-f h', '-f m', '-f n').", filename=True, equate=False, ), _Option( ["-N", "-#", "num_replicates"], "Number of alternative runs on distinct starting trees. " "In combination with the '-b' option, this will invoke a " "multiple bootstrap analysis. " "DEFAULT: 1 single analysis." "Note that '-N' has been added as an alternative since " "'-#' sometimes caused problems with certain MPI job " "submission systems, since '-#' is often used to start " "comments. ", equate=False, ), ] AbstractCommandline.__init__(self, cmd, **kwargs) # ENH: enforce -s, -n and -m if not self.parsimony_seed: self.parsimony_seed = 10000
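# Usage sketch, not part of the original source: assuming this is the
# RaxmlCommandline wrapper (Bio.Phylo.Applications). Note the constructor above
# falls back to parsimony_seed=10000 when no seed is supplied, so -p always
# appears in the rendered command.
#
#     >>> from Bio.Phylo.Applications import RaxmlCommandline
#     >>> cline = RaxmlCommandline(sequences="alignment.phy",
#     ...                          model="GTRGAMMA", name="run1")
#     >>> print(cline)
#     raxmlHPC -m GTRGAMMA -n run1 -p 10000 -s alignment.phy
#     >>> stdout, stderr = cline()  # output files are named RAxML_*.run1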
def __init__(self, cmd="bwa", **kwargs): self.program_name = cmd self.parameters = [ _StaticArgument("aln"), _Argument(["reference"], "Reference file name", filename=True, is_required=True), _Argument(["read_file"], "Read file name", filename=True, is_required=True), _Option( ["-n", "n"], "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", checker_function=lambda x: isinstance(x, (int, float)), equate=False), _Option( ["-o", "o"], "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", checker_function=lambda x: isinstance(x, (int, float)), equate=False), _Option( ["-e", "e"], "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-d", "d"], "Disallow a long deletion within INT bp towards the 3-end [16]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-i", "i"], "Disallow an indel within INT bp towards the ends [5]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-l", "l"], """Take the first INT subsequence as seed. If INT is larger than the query sequence, seeding will be disabled. For long reads, this option is typically ranged from 25 to 35 for -k 2. [inf]""", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-k", "k"], "Maximum edit distance in the seed [2]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-M", "M"], "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-O", "O"], "Gap open penalty [11]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-E", "E"], "Gap extension penalty [4]", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-R", "R"], """Proceed with suboptimal alignments if there are no more than INT equally best hits. This option only affects paired-end mapping. Increasing this threshold helps to improve the pairing accuracy at the cost of speed, especially for short reads (~32bp).""", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-q", "q"], """Parameter for read trimming [0]. BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original read length.""", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-B", "B"], "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", checker_function=lambda x: isinstance(x, int), equate=False), _Switch([ "-c", "c" ], "Reverse query but not complement it, which is required for alignment in the color space." ), _Switch([ "-N", "N" ], "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default." 
), _Switch([ "-I", "I" ], "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)." ), _Switch(["-b", "b"], "Specify the input read sequence file is the BAM format"), _Switch([ "-b1", "b1" ], "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)." ), _Switch([ "-b2", "b2" ], "When -b is specified, only use the second read in a read pair in mapping." ) ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="bwa", **kwargs): self.program_name = cmd self.parameters = [ _StaticArgument("bwasw"), _Argument(["reference"], "Reference file name", filename=True, is_required=True), _Argument(["read_file"], "Read file", filename=True, is_required=True), _Argument(["mate_file"], "Mate file", filename=True, is_required=False), _Option(["-a", "a"], "Score of a match [1]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-b", "b"], "Mismatch penalty [3]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-q", "q"], "Gap open penalty [5]", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-r", "r"], "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-t", "t"], "Number of threads in the multi-threading mode [1]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-w", "w"], "Band width in the banded alignment [33]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-T", "T"], "Minimum score threshold divided by a [37]", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-c", "c"], """Coefficient for threshold adjustment according to query length [5.5]. Given an l-long query, the threshold for a hit to be retained is a*max{T,c*log(l)}.""", checker_function=lambda x: isinstance(x, float), equate=False), _Option( ["-z", "z"], "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-s", "s"], """Maximum SA interval size for initiating a seed [3]. Higher -s increases accuracy at the cost of speed.""", checker_function=lambda x: isinstance(x, int), equate=False), _Option( ["-N", "N"], "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", checker_function=lambda x: isinstance(x, int), equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="clustalo", **kwargs): """Initialize the class.""" # order parameters in the same order as clustalo --help self.parameters = [ # Sequence Input _Option( ["-i", "--in", "--infile", "infile"], "Multiple sequence input file", filename=True, equate=False, ), _Option( ["--hmm-in", "HMM input", "hmm_input"], "HMM input files", filename=True, equate=False, ), _Switch(["--dealign", "dealign"], "Dealign input sequences"), _Option( ["--profile1", "--p1", "profile1"], "Pre-aligned multiple sequence file (aligned columns will be kept fix).", filename=True, equate=False, ), _Option( ["--profile2", "--p2", "profile2"], "Pre-aligned multiple sequence file (aligned columns will be kept fix).", filename=True, equate=False, ), _Option( ["-t", "--seqtype", "seqtype"], "{Protein, RNA, DNA} Force a sequence type (default: auto).", equate=False, checker_function=lambda x: x in ["protein", "rna", "dna", "Protein", "RNA", "DNA", "PROTEIN"], ), _Switch( ["--is-profile", "isprofile"], "disable check if profile, force profile (default no)", ), _Option( ["--infmt", "infmt"], """Forced sequence input file format (default: auto) Allowed values: a2m, fa[sta], clu[stal], msf, phy[lip], selex, st[ockholm], vie[nna] """, equate=False, checker_function=lambda x: x in [ "a2m", "fa", "fasta", "clu", "clustal", "msf", "phy", "phylip", "selex", "st", "stockholm", "vie", "vienna", ], ), # Clustering _Option( ["--distmat-in", "distmat_in"], "Pairwise distance matrix input file (skips distance computation).", filename=True, equate=False, ), _Option( ["--distmat-out", "distmat_out"], "Pairwise distance matrix output file.", filename=True, equate=False, ), _Option( ["--guidetree-in", "guidetree_in"], "Guide tree input file (skips distance computation and guide-tree clustering step).", filename=True, equate=False, ), _Option( ["--guidetree-out", "guidetree_out"], "Guide tree output file.", filename=True, equate=False, ), _Switch( ["--full", "distmat_full"], "Use full distance matrix for guide-tree calculation (slow; mBed is default)", ), _Switch( ["--full-iter", "distmat_full_iter"], "Use full distance matrix for guide-tree calculation during iteration (mBed is default)", ), _Option( ["--cluster-size", "clustersize"], "soft maximum of sequences in sub-clusters", checker_function=lambda x: isinstance(x, int), ), _Option( ["--clustering-out", "clusteringout"], "Clustering output file", filename=True, ), _Switch( ["--use-kimura", "usekimura"], "use Kimura distance correction for aligned sequences (default no)", ), _Switch( ["--percent-id", "percentid"], "convert distances into percent identities (default no)", ), # Alignment Output _Option( ["-o", "--out", "--outfile", "outfile"], "Multiple sequence alignment output file (default: stdout).", filename=True, equate=False, ), _Option( ["--outfmt", "outfmt"], "MSA output file format:" " a2m=fa[sta],clu[stal],msf,phy[lip],selex,st[ockholm],vie[nna]" " (default: fasta).", equate=False, checker_function=lambda x: x in [ "a2m", "fa", "fasta", "clu", "clustal", "msf", "phy", "phylip", "selex", "st", "stockholm", "vie", "vienna", ], ), _Switch( ["--residuenumber", "--resno", "residuenumber"], "in Clustal format print residue numbers (default no)", ), _Option( ["--wrap", "wrap"], "number of residues before line-wrap in output", checker_function=lambda x: isinstance(x, int), ), _Option( ["--output-order", "outputorder"], "MSA output order like in input/guide-tree", checker_function=lambda x: x in ["input-order", "tree-order"], ), # Iteration _Option( ["--iterations", 
"--iter", "iterations"], "Number of (combined guide-tree/HMM) iterations", equate=False, checker_function=lambda x: isinstance(x, int), ), _Option( ["--max-guidetree-iterations", "max_guidetree_iterations"], "Maximum number of guidetree iterations", equate=False, checker_function=lambda x: isinstance(x, int), ), _Option( ["--max-hmm-iterations", "max_hmm_iterations"], "Maximum number of HMM iterations", equate=False, checker_function=lambda x: isinstance(x, int), ), # Limits (will exit early, if exceeded): _Option( ["--maxnumseq", "maxnumseq"], "Maximum allowed number of sequences", equate=False, checker_function=lambda x: isinstance(x, int), ), _Option( ["--maxseqlen", "maxseqlen"], "Maximum allowed sequence length", equate=False, checker_function=lambda x: isinstance(x, int), ), # Miscellaneous: _Switch( ["--auto", "auto"], "Set options automatically (might overwrite some of your options)", ), _Option( ["--threads", "threads"], "Number of processors to use", equate=False, checker_function=lambda x: isinstance(x, int), ), _Option( ["-l", "--log", "log"], "Log all non-essential output to this file.", filename=True, equate=False, ), _Switch(["-h", "--help", "help"], "Print help and exit."), _Switch(["-v", "--verbose", "verbose"], "Verbose output"), _Switch(["--version", "version"], "Print version information and exit"), _Switch( ["--long-version", "long_version"], "Print long version information and exit", ), _Switch(["--force", "force"], "Force file overwriting."), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="novoalign", **kwargs): READ_FORMAT = ['FA', 'SLXFQ', 'STDFQ', 'ILMFQ', 'PRB', 'PRBnSEQ'] REPORT_FORMAT = ['Native', 'Pairwise', 'SAM'] REPEAT_METHOD = ['None', 'Random', 'All', 'Exhaustive', '0.99'] self.parameters = \ [ _Option(["-d", "database"], "database filename", filename=True, equate=False), _Option(["-f", "readfile"], "read file", filename=True, equate=False), _Option(["-F", "format"], "Format of read files.\n\nAllowed values: %s" % ", ".join(READ_FORMAT), checker_function=lambda x: x in READ_FORMAT, equate=False), # Alignment scoring options _Option(["-t", "threshold"], "Threshold for alignment score", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-g", "gap_open"], "Gap opening penalty [default: 40]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-x", "gap_extend"], "Gap extend penalty [default: 15]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-u", "unconverted"], "Experimental: unconverted cytosines penalty in bisulfite mode\n\n" "Default: no penalty", checker_function=lambda x: isinstance(x, int), equate=False), # Quality control and read filtering _Option(["-l", "good_bases"], "Minimum number of good quality bases [default: log(N_g, 4) + 5]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-h", "homopolymer"], "Homopolymer read filter [default: 20; disable: negative value]", checker_function=lambda x: isinstance(x, int), equate=False), # Read preprocessing options _Option(["-a", "adapter3"], "Strips a 3' adapter sequence prior to alignment.\n\n" "With paired ends two adapters can be specified", checker_function=lambda x: isinstance(x, str), equate=False), _Option(["-n", "truncate"], "Truncate to specific length before alignment", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-s", "trimming"], "If fail to align, trim by s bases until they map or become shorter than l.\n\n" "Ddefault: 2", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-5", "adapter5"], "Strips a 5' adapter sequence.\n\n" "Similar to -a (adaptor3), but on the 5' end.", checker_function=lambda x: isinstance(x, str), equate=False), # Reporting options _Option(["-o", "report"], "Specifies the report format.\n\nAllowed values: %s\nDefault: Native" % ", ".join(REPORT_FORMAT), checker_function=lambda x: x in REPORT_FORMAT, equate=False), _Option(["-Q", "quality"], "Lower threshold for an alignment to be reported [default: 0]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-R", "repeats"], "If score difference is higher, report repeats.\n\n" "Otherwise -r read method applies [default: 5]", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-r", "r_method"], "Methods to report reads with multiple matches.\n\n" "Allowed values: %s\n" "'All' and 'Exhaustive' accept limits." 
% ", ".join(REPEAT_METHOD), checker_function=lambda x: x.split()[0] in REPEAT_METHOD, equate=False), _Option(["-e", "recorded"], "Alignments recorded with score equal to the best.\n\n" "Default: 1000 in default read method, otherwise no limit.", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-q", "qual_digits"], "Decimal digits for quality scores [default: 0]", checker_function=lambda x: isinstance(x, int), equate=False), # Paired end options _Option(["-i", "fragment"], "Fragment length (2 reads + insert) and standard deviation [default: 250 30]", checker_function=lambda x: len(x.split()) == 2, equate=False), _Option(["-v", "variation"], "Structural variation penalty [default: 70]", checker_function=lambda x: isinstance(x, int), equate=False), # miRNA mode _Option(["-m", "miRNA"], "Sets miRNA mode and optionally sets a value for the region scanned [default: off]", checker_function=lambda x: isinstance(x, int), equate=False), # Multithreading _Option(["-c", "cores"], "Number of threads, disabled on free versions [default: number of cores]", checker_function=lambda x: isinstance(x, int), equate=False), # Quality calibrations _Option(["-k", "read_cal"], "Read quality calibration from file (mismatch counts)", checker_function=lambda x: isinstance(x, str), equate=False), _Option(["-K", "write_cal"], "Accumulate mismatch counts and write to file", checker_function=lambda x: isinstance(x, str), equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="bbcontacts", **kwargs): # TODO: figure a way to group CL arguments as in `mutually_exclusive_group` if "dssp_file" in list(kwargs.keys()) and "psipred_file" in list( kwargs.keys()): msg = "Provide only one of [dssp_file|psipred_file]!" raise RuntimeError(msg) elif not ("dssp_file" in list(kwargs.keys()) or "psipred_file" in list(kwargs.keys())): msg = "Provide one of [dssp_file|psipred_file]!" raise RuntimeError(msg) self.parameters = [ _Option(["-c", "config_file"], "bbcontacts configuration file", filename=True, equate=False), _Option( ["-s", "smoothing_size"], "Perform local background correction of the coupling matrix " "before decoding: from each coupling, subtract the average " "coupling (smoothed background) over an area extending by " "SMOOTHINGSIZE in each direction [default=10, use 0 for no " "local background correction]", equate=False, ), _Switch( ["-l", "long_predictions"], "Turn off (slow) prediction-shortening mode (this mode is on " "by default but will only get triggered when long predictions occur)", ), _Option( ["-n", "pdb_name"], "Provide a PDB identifier (when also using -e, this will be the " "PDB name to look for in EVALUATIONFILE)", equate=False, ), _Option( ["-e", "evaluation_file"], "Provide a file containing the true contacts (BetaSheet916.dat, " "BetaSheet1452.dat or same format) for evaluation", filename=True, equate=False, ), _Argument(["matfile"], "CCMpred-like coupling matrix", filename=True, is_required=True), _Argument(["diversity_score"], "sequence-dependent diversity score", is_required=True), _Argument(["prefix"], "output prefix", is_required=True), _Option(["-d", "dssp_file"], "DSSP secondary structure prediction file", filename=True, equate=False), _Option(["-p", "psipred_file"], "PSIPRED secondary structure prediction file", filename=True, equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="fseqboot", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "seq file to sample (phylip)", filename=True, is_required=True, ), _Option(["-categories", "catergories"], "file of input categories"), _Option(["-weights", "weights"], " weights file"), _Option(["-test", "test"], "specify operation, default is bootstrap"), _Option(["-regular", "regular"], "absolute number to resample"), _Option(["-fracsample", "fracsample"], "fraction to resample"), _Option( ["-rewriteformat", "rewriteformat"], "output format ([P]hyilp, [n]exus, [x]ml", ), _Option(["-seqtype", "seqtype"], "output format ([D]na, [p]rotein, [r]na"), _Option(["-blocksize", "blocksize"], "print progress (Y/n)"), _Option(["-reps", "reps"], "how many replicates, defaults to 100)"), _Option( ["-justweights", "jusweights"], "what to write out [D]atasets of just [w]eights", ), _Option(["-seed", "seed"], "specify random seed"), _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="muscle", **kwargs): """Initialize the class.""" CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3", "kmer4_6"] DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ ["pctid_kimura", "pctid_log"] OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] SEQUENCE_TYPES = ["protein", "nucleo", "auto"] WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb", "gsc", "threeway"] self.parameters = [ # Can't use "in" as the final alias as this # is a reserved word in python: _Option(["-in", "in", "input"], "Input filename", filename=True, equate=False), _Option(["-out", "out"], "Output filename", filename=True, equate=False), _Switch(["-diags", "diags"], "Find diagonals (faster for similar sequences)"), _Switch(["-profile", "profile"], "Perform a profile alignment"), _Option(["-in1", "in1"], "First input filename for profile alignment", filename=True, equate=False), _Option(["-in2", "in2"], "Second input filename for a profile alignment", filename=True, equate=False), # anchorspacing Integer 32 Minimum spacing # between anchor cols _Option(["-anchorspacing", "anchorspacing"], "Minimum spacing between anchor columns", checker_function=lambda x: isinstance(x, int), equate=False), # center Floating point [1] Center parameter. # Should be negative. _Option(["-center", "center"], "Center parameter - should be negative", checker_function=lambda x: isinstance(x, float), equate=False), # cluster1 upgma upgmb Clustering method. _Option(["-cluster1", "cluster1"], "Clustering method used in iteration 1", checker_function=lambda x: x in CLUSTERING_ALGORITHMS, equate=False), # cluster2 upgmb cluster1 is used # neighborjoining in iteration 1 and # 2, cluster2 in # later iterations. _Option(["-cluster2", "cluster2"], "Clustering method used in iteration 2", checker_function=lambda x: x in CLUSTERING_ALGORITHMS, equate=False), # diaglength Integer 24 Minimum length of # diagonal. _Option(["-diaglength", "diaglength"], "Minimum length of diagonal", checker_function=lambda x: isinstance(x, int), equate=True), # diagmargin Integer 5 Discard this many # positions at ends # of diagonal. _Option(["-diagmargin", "diagmargin"], "Discard this many positions at ends of diagonal", checker_function=lambda x: isinstance(x, int), equate=False), # distance1 kmer6_6 Kmer6_6(amino) or Distance measure # kmer20_3 Kmer4_6(nucleo) for iteration 1 # kmer20_4 # kbit20_3 # kmer4_6 _Option(["-distance1", "distance1"], "Distance measure for iteration 1", checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, equate=False), # distance2 kmer6_6 pctid_kimura Distance measure # kmer20_3 for iterations # kmer20_4 2, 3 ... # kbit20_3 # pctid_kimura # pctid_log _Option(["-distance2", "distance2"], "Distance measure for iteration 2", checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, equate=False), # gapextend Floating point [1] The gap extend score _Option(["-gapextend", "gapextend"], "Gap extension penalty", checker_function=lambda x: isinstance(x, float), equate=False), # gapopen Floating point [1] The gap open score # Must be negative. _Option(["-gapopen", "gapopen"], "Gap open score - negative number", checker_function=lambda x: isinstance(x, float), equate=False), # hydro Integer 5 Window size for # determining whether # a region is # hydrophobic. 
_Option(["-hydro", "hydro"], "Window size for hydrophobic region", checker_function=lambda x: isinstance(x, int), equate=False), # hydrofactor Floating point 1.2 Multiplier for gap # open/close # penalties in # hydrophobic regions _Option(["-hydrofactor", "hydrofactor"], "Multiplier for gap penalties in hydrophobic regions", checker_function=lambda x: isinstance(x, float), equate=False), # log File name None. Log file name # (delete existing # file). _Option(["-log", "log"], "Log file name", filename=True, equate=False), # loga File name None. Log file name # (append to existing # file). _Option(["-loga", "loga"], "Log file name (append to existing file)", filename=True, equate=False), # matrix File name None. File name for # substitution matrix # in NCBI or WU-BLAST # format. If you # specify your own # matrix, you should # also specify: # -gapopen <g> # -gapextend <e> # -center 0.0 _Option(["-matrix", "matrix"], "path to NCBI or WU-BLAST format protein substitution " "matrix - also set -gapopen, -gapextend and -center", filename=True, equate=False), # diagbreak Integer 1 Maximum distance # between two # diagonals that # allows them to # merge into one # diagonal. _Option(["-diagbreak", "diagbreak"], "Maximum distance between two diagonals that allows " "them to merge into one diagonal", checker_function=lambda x: isinstance(x, int), equate=False), _Option(["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8 "Deprecated in v3.8, use -diagbreak instead.", checker_function=lambda x: isinstance(x, int), equate=False), # maxhours Floating point None. Maximum time to # run in hours. The # actual time may # exceed requested # limit by a few # minutes. Decimals # are allowed, so 1.5 # means one hour and # 30 minutes. _Option(["-maxhours", "maxhours"], "Maximum time to run in hours", checker_function=lambda x: isinstance(x, float), equate=False), # maxiters Integer 1, 2 ... 16 Maximum number of # iterations. _Option(["-maxiters", "maxiters"], "Maximum number of iterations", checker_function=lambda x: isinstance(x, int), equate=False), # maxtrees Integer 1 Maximum number of # new trees to build # in iteration 2. _Option(["-maxtrees", "maxtrees"], "Maximum number of trees to build in iteration 2", checker_function=lambda x: isinstance(x, int), equate=False), # minbestcolscore Floating point [1] Minimum score a # column must have to # be an anchor. _Option(["-minbestcolscore", "minbestcolscore"], "Minimum score a column must have to be an anchor", checker_function=lambda x: isinstance(x, float), equate=False), # minsmoothscore Floating point [1] Minimum smoothed # score a column must # have to be an # anchor. _Option(["-minsmoothscore", "minsmoothscore"], "Minimum smoothed score a column must have to " "be an anchor", checker_function=lambda x: isinstance(x, float), equate=False), # objscore sp spm Objective score # ps used by tree # dp dependent # xp refinement. # spf sp=sum-of-pairs # spm score. (dimer # approximation) # spm=sp for < 100 # seqs, otherwise spf # dp=dynamic # programming score. # ps=average profile- # sequence score. # xp=cross profile # score. _Option(["-objscore", "objscore"], "Objective score used by tree dependent refinement", checker_function=lambda x: x in OBJECTIVE_SCORES, equate=False), # refinewindow Integer 200 Length of window # for -refinew. 
_Option(["-refinewindow", "refinewindow"], "Length of window for -refinew", checker_function=lambda x: isinstance(x, int), equate=False), # root1 pseudo pseudo Method used to root _Option(["-root1", "root1"], "Method used to root tree in iteration 1", checker_function=lambda x: x in TREE_ROOT_METHODS, equate=False), # root2 midlongestspan tree; root1 is # minavgleafdist used in iteration 1 # and 2, root2 in # later iterations. _Option(["-root2", "root2"], "Method used to root tree in iteration 2", checker_function=lambda x: x in TREE_ROOT_METHODS, equate=False), # scorefile File name None File name where to # write a score file. # This contains one # line for each column # in the alignment. # The line contains # the letters in the # column followed by # the average BLOSUM62 # score over pairs of # letters in the # column. _Option(["-scorefile", "scorefile"], "Score file name, contains one line for each column" " in the alignment with average BLOSUM62 score", filename=True, equate=False), # seqtype protein auto Sequence type. # nucleo # auto _Option(["-seqtype", "seqtype"], "Sequence type", checker_function=lambda x: x in SEQUENCE_TYPES, equate=False), # smoothscoreceil Floating point [1] Maximum value of # column score for # smoothing purposes. _Option(["-smoothscoreceil", "smoothscoreceil"], "Maximum value of column score for smoothing", checker_function=lambda x: isinstance(x, float), equate=False), # smoothwindow Integer 7 Window used for # anchor column # smoothing. _Option(["-smoothwindow", "smoothwindow"], "Window used for anchor column smoothing", checker_function=lambda x: isinstance(x, int), equate=False), # spscore File name Compute SP # objective score of # multiple alignment. _Option(["-spscore", "spscore"], "Compute SP objective score of multiple alignment", filename=True, equate=False), # SUEFF Floating point value 0.1 Constant used in # between 0 and 1. UPGMB clustering. # Determines the # relative fraction # of average linkage # (SUEFF) vs. nearest # neighbor linkage # (1 SUEFF). _Option(["-sueff", "sueff"], "Constant used in UPGMB clustering", checker_function=lambda x: isinstance(x, float), equate=False), # tree1 File name None Save tree _Option(["-tree1", "tree1"], "Save Newick tree from iteration 1", equate=False), # tree2 first or second # iteration to given # file in Newick # (Phylip-compatible) # format. _Option(["-tree2", "tree2"], "Save Newick tree from iteration 2", equate=False), # usetree File name None Use given tree as # guide tree. Must by # in Newick # (Phyip-compatible) # format. _Option(["-usetree", "usetree"], "Use given Newick tree as guide tree", filename=True, equate=False), # weight1 none clustalw Sequence weighting _Option(["-weight1", "weight1"], "Weighting scheme used in iteration 1", checker_function=lambda x: x in WEIGHTING_SCHEMES, equate=False), # weight2 henikoff scheme. # henikoffpb weight1 is used in # gsc iterations 1 and 2. # clustalw weight2 is used for # threeway tree-dependent # refinement. # none=all sequences # have equal weight. # henikoff=Henikoff & # Henikoff weighting # scheme. # henikoffpb=Modified # Henikoff scheme as # used in PSI-BLAST. # clustalw=CLUSTALW # method. # threeway=Gotoh # three-way method. 
_Option(["-weight2", "weight2"], "Weighting scheme used in iteration 2", checker_function=lambda x: x in WEIGHTING_SCHEMES, equate=False), # ################### FORMATS #################################### # Multiple formats can be specified on the command line # If -msf appears it will be used regardless of other formats # specified. If -clw appears (and not -msf), clustalw format will # be used regardless of other formats specified. If both -clw and # -clwstrict are specified -clwstrict will be used regardless of # other formats specified. If -fasta is specified and not -msf, # -clw, or clwstrict, fasta will be used. If -fasta and -html are # specified -fasta will be used. Only if -html is specified alone # will html be used. I kid ye not. # clw no Write output in CLUSTALW format # (default is FASTA). _Switch(["-clw", "clw"], "Write output in CLUSTALW format (with a MUSCLE header)"), # clwstrict no Write output in CLUSTALW format with # the "CLUSTAL W (1.81)" header rather # than the MUSCLE version. This is # useful when a post-processing step is # picky about the file header. _Switch(["-clwstrict", "clwstrict"], "Write output in CLUSTALW format with version" "1.81 header"), # fasta yes Write output in FASTA format. # Alternatives include clw, # clwstrict, msf and html. _Switch(["-fasta", "fasta"], "Write output in FASTA format"), # html no Write output in HTML format (default # is FASTA). _Switch(["-html", "html"], "Write output in HTML format"), # msf no Write output in MSF format (default # is FASTA). _Switch(["-msf", "msf"], "Write output in MSF format"), # Phylip interleaved - undocumented as of 3.7 _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"), # Phylip sequential - undocumented as of 3.7 _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"), # ################# Additional specified output files ######### _Option(["-phyiout", "phyiout"], "Write PHYLIP interleaved output to specified filename", filename=True, equate=False), _Option(["-physout", "physout"], "Write PHYLIP sequential format to specified filename", filename=True, equate=False), _Option(["-htmlout", "htmlout"], "Write HTML output to specified filename", filename=True, equate=False), _Option(["-clwout", "clwout"], "Write CLUSTALW output (with MUSCLE header) to specified " "filename", filename=True, equate=False), _Option(["-clwstrictout", "clwstrictout"], "Write CLUSTALW output (with version 1.81 header) to " "specified filename", filename=True, equate=False), _Option(["-msfout", "msfout"], "Write MSF format output to specified filename", filename=True, equate=False), _Option(["-fastaout", "fastaout"], "Write FASTA format output to specified filename", filename=True, equate=False), # ############# END FORMATS ################################### # anchors yes Use anchor optimization in tree # dependent refinement iterations. _Switch(["-anchors", "anchors"], "Use anchor optimisation in tree dependent " "refinement iterations"), # noanchors no Disable anchor optimization. Default # is anchors. _Switch(["-noanchors", "noanchors"], "Do not use anchor optimisation in tree dependent " "refinement iterations"), # brenner no Use Steven Brenner's method for # computing the root alignment. _Switch(["-brenner", "brenner"], "Use Steve Brenner's root alignment method"), # cluster no Perform fast clustering of input # sequences. Use the tree1 option to # save the tree. 
_Switch(["-cluster", "cluster"], "Perform fast clustering of input sequences, " "use -tree1 to save tree"), # dimer no Use dimer approximation for the # SP score (faster, less accurate). _Switch(["-dimer", "dimer"], "Use faster (slightly less accurate) dimer approximation" "for the SP score"), # group yes Group similar sequences together # in the output. This is the default. # See also stable. _Switch(["-group", "group"], "Group similar sequences in output"), # ############# log-expectation profile score #################### # One of either -le, -sp, or -sv # # According to the doc, spn is default and the only option for # nucleotides: this doesn't appear to be true. -le, -sp, and -sv # can be used and produce numerically different logs # (what is going on?) # # spn fails on proteins # le maybe Use log-expectation profile score # (VTML240). Alternatives are to use sp # or sv. This is the default for amino # acid sequences. _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"), # sv no Use sum-of-pairs profile score # (VTML240). Default is le. _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"), # sp no Use sum-of-pairs protein profile # score (PAM200). Default is le. _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"), # spn maybe Use sum-of-pairs nucleotide profile # score (BLASTZ parameters). This is # the only option for nucleotides, # and is therefore the default. _Switch(["-spn", "spn"], "Use sum-of-pairs protein nucleotide profile score"), # ########## END log-expectation profile score ################### # quiet no Do not display progress messages. _Switch(["-quiet", "quiet"], "Do not display progress messages"), # refine no Input file is already aligned, skip # first two iterations and begin tree # dependent refinement. _Switch(["-refine", "refine"], "Only do tree dependent refinement"), # refinew no Refine an alignment by dividing it # into non-overlapping windows and # re-aligning each window. Typically # used for whole-genome nucleotide # alignments. _Switch(["-refinew", "refinew"], "Only do tree dependent refinement using " "sliding window approach"), # core yes in muscle, Do not catch exceptions. # no in muscled. _Switch(["-core", "core"], "Do not catch exceptions"), # nocore no in muscle, Catch exceptions and give an # yes in muscled. error message if possible. _Switch(["-nocore", "nocore"], "Catch exceptions"), # stable no Preserve input order of sequences # in output file. Default is to group # sequences by similarity (group). _Switch(["-stable", "stable"], "Do not group similar sequences in output " "(not supported in v3.8)"), # termgaps4 yes Use 4-way test for treatment of # terminal gaps. # (Cannot be disabled in this version). # # termgapsfull no Terminal gaps penalized with # full penalty. [1] Not fully # supported in this version # # termgapshalf yes Terminal gaps penalized with # half penalty. [1] Not fully # supported in this version # # termgapshalflonger no Terminal gaps penalized with # half penalty if gap relative # to longer sequence, otherwise with # full penalty. [1] Not fully # supported in this version # # verbose no Write parameter settings and # progress messages to log file. _Switch(["-verbose", "verbose"], "Write parameter settings and progress"), # version no Write version string to # stdout and exit _Switch(["-version", "version"], "Write version string to stdout and exit"), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="prank", **kwargs): """Initialize the class.""" OUTPUT_FORMAT_VALUES = list(range(1, 18)) self.parameters = [ # ################# input/output parameters: ################## # -d=sequence_file _Option(["-d", "d"], "Input filename", filename=True, is_required=True), # -t=tree_file [default: no tree, generate approximate NJ tree] _Option(["-t", "t"], "Input guide tree filename", filename=True), # -tree="tree_string" [tree in newick format; in double quotes] _Option(["-tree", "tree"], "Input guide tree as Newick string"), # -m=model_file [default: HKY2/WAG] _Option( ["-m", "m"], "User-defined alignment model filename. Default: HKY2/WAG"), # -o=output_file [default: 'output'] _Option( ["-o", "o"], "Output filenames prefix. Default: 'output'\n " "Will write: output.?.fas (depending on requested " "format), output.?.xml and output.?.dnd", filename=True, ), # -f=output_format [default: 8] _Option( ["-f", "f"], "Output alignment format. Default: 8 FASTA\n" "Option are:\n" "1. IG/Stanford 8. Pearson/Fasta\n" "2. GenBank/GB 11. Phylip3.2\n" "3. NBRF 12. Phylip\n" "4. EMBL 14. PIR/CODATA\n" "6. DNAStrider 15. MSF\n" "7. Fitch 17. PAUP/NEXUS", checker_function=lambda x: x in OUTPUT_FORMAT_VALUES, ), _Switch( ["-noxml", "noxml"], "Do not output XML files (PRANK versions earlier than v.120626)", ), _Switch( ["-notree", "notree"], "Do not output dnd tree files (PRANK versions earlier than v.120626)", ), _Switch(["-showxml", "showxml"], "Output XML files (PRANK v.120626 and later)"), _Switch( ["-showtree", "showtree"], "Output dnd tree files (PRANK v.120626 and later)", ), _Switch(["-shortnames", "shortnames"], "Truncate names at first space"), _Switch(["-quiet", "quiet"], "Reduce verbosity"), # ###################### model parameters: ###################### # +F [force insertions to be always skipped] # -F [equivalent] _Switch(["-F", "+F", "F"], "Force insertions to be always skipped: same as +F"), # -dots [show insertion gaps as dots] _Switch(["-dots", "dots"], "Show insertion gaps as dots"), # -gaprate=# [gap opening rate; default: dna 0.025 / prot 0.0025] _Option( ["-gaprate", "gaprate"], "Gap opening rate. Default: dna 0.025 prot 0.0025", checker_function=lambda x: isinstance(x, float), ), # -gapext=# [gap extension probability; default: dna 0.5 / prot 0.5] _Option( ["-gapext", "gapext"], "Gap extension probability. Default: dna 0.5 / prot 0.5", checker_function=lambda x: isinstance(x, float), ), # -dnafreqs=#,#,#,# [ACGT; default: empirical] _Option( ["-dnafreqs", "dnafreqs"], "DNA frequencies - 'A,C,G,T'. eg '25,25,25,25' as a quote " "surrounded string value. Default: empirical", checker_function=lambda x: isinstance(x, bytes), ), # -kappa=# [ts/tv rate ratio; default:2] _Option( ["-kappa", "kappa"], "Transition/transversion ratio. Default: 2", checker_function=lambda x: isinstance(x, int), ), # -rho=# [pur/pyr rate ratio; default:1] _Option( ["-rho", "rho"], "Purine/pyrimidine ratio. Default: 1", checker_function=lambda x: isinstance(x, int), ), # -codon [for DNA: use empirical codon model] _Switch(["-codon", "codon"], "Codon aware alignment or not"), # -termgap [penalise terminal gaps normally] _Switch(["-termgap", "termgap"], "Penalise terminal gaps normally"), # ############### other parameters: ################################ # -nopost [do not compute posterior support; default: compute] _Switch( ["-nopost", "nopost"], "Do not compute posterior support. 
Default: compute", ), # -pwdist=# [expected pairwise distance for computing guidetree; # default: dna 0.25 / prot 0.5] _Option( ["-pwdist", "pwdist"], "Expected pairwise distance for computing guidetree. " "Default: dna 0.25 / prot 0.5", checker_function=lambda x: isinstance(x, float), ), _Switch(["-once", "once"], "Run only once. Default: twice if no guidetree given"), _Switch(["-twice", "twice"], "Always run twice"), _Switch(["-skipins", "skipins"], "Skip insertions in posterior support"), _Switch( ["-uselogs", "uselogs"], "Slower but should work for a greater number of sequences", ), _Switch(["-writeanc", "writeanc"], "Output ancestral sequences"), _Switch(["-printnodes", "printnodes"], "Output each node; mostly for debugging"), # -matresize=# [matrix resizing multiplier] # Doesn't specify type but Float and Int work _Option( ["-matresize", "matresize"], "Matrix resizing multiplier", checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)), ), # -matinitsize=# [matrix initial size multiplier] # Doesn't specify type but Float and Int work _Option( ["-matinitsize", "matinitsize"], "Matrix initial size multiplier", checker_function=lambda x: (isinstance(x, float) or isinstance(x, int)), ), _Switch(["-longseq", "longseq"], "Save space in pairwise alignments"), _Switch(["-pwgenomic", "pwgenomic"], "Do pairwise alignment, no guidetree"), # -pwgenomicdist=# [distance for pairwise alignment; default: 0.3] _Option( ["-pwgenomicdist", "pwgenomicdist"], "Distance for pairwise alignment. Default: 0.3", checker_function=lambda x: isinstance(x, float), ), # -scalebranches=# [scale branch lengths; default: dna 1 / prot 2] _Option( ["-scalebranches", "scalebranches"], "Scale branch lengths. Default: dna 1 / prot 2", checker_function=lambda x: isinstance(x, int), ), # -fixedbranches=# [use fixed branch lengths] # Assume looking for a float _Option( ["-fixedbranches", "fixedbranches"], "Use fixed branch lengths of input value", checker_function=lambda x: isinstance(x, float), ), # -maxbranches=# [set maximum branch length] # Assume looking for a float _Option( ["-maxbranches", "maxbranches"], "Use maximum branch lengths of input value", checker_function=lambda x: isinstance(x, float), ), # -realbranches [disable branch length truncation] _Switch(["-realbranches", "realbranches"], "Disable branch length truncation"), _Switch(["-translate", "translate"], "Translate to protein"), _Switch(["-mttranslate", "mttranslate"], "Translate to protein using mt table"), # ##################### other: #################### _Switch( ["-convert", "convert"], "Convert input alignment to new format. Do not perform alignment", ), ] AbstractCommandline.__init__(self, cmd, **kwargs)
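# Usage sketch (assumption: this is the PrankCommandline wrapper from
# Bio.Align.Applications with a "prank" binary installed). PRANK options take
# the form -flag=value, so the options above rely on the default equate
# behaviour; results are written to files named after the -o prefix:
#
#   >>> from Bio.Align.Applications import PrankCommandline
#   >>> cline = PrankCommandline(d="unaligned.fasta", o="example", f=8)
#   >>> print(cline)
#   prank -d=unaligned.fasta -o=example -f=8
#   >>> stdout, stderr = cline()  # writes example.*.fas (plus xml/dnd files)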
def __init__(self, cmd="muscle", **kwargs): CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] DISTANCE_MEASURES_ITER1 = [ "kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3", "kmer4_6" ] DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ ["pctid_kimura", "pctid_log"] OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] SEQUENCE_TYPES = ["protein", "nucleo", "auto"] WEIGHTING_SCHEMES = [ "none", "clustalw", "henikoff", "henikoffpb", "gsc", "threeway" ] self.parameters = \ [ #Can't use "in" as the final alias as this is a reserved word in python: _Option(["-in", "in", "input"], ["input", "file"], None, 0, "Input filename", 0), #No equate _Option(["-out", "out"], ["output", "file"], None, 0, "Output filename", 0), #No equate _Switch(["-diags", "diags"], ["input"], "Find diagonals (faster for similar sequences)"), _Switch(["-profile", "profile"], ["input"], "Perform a profile alignment"), _Option(["-in1", "in1"], ["input", "file"], None, 0, "First input filename for profile alignment", 0), _Option(["-in2", "in2"], ["input", "file"], None, 0, "Second input filename for a profile alignment", 0), #anchorspacing Integer 32 Minimum spacing between _Option(["-anchorspacing", "anchorspacing"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Minimum spacing between anchor columns", 0), #center Floating point [1] Center parameter. # Should be negative. _Option(["-center", "center"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Center parameter - should be negative", 0), #cluster1 upgma upgmb Clustering method. _Option(["-cluster1", "cluster1"], ["input"], lambda x: x in CLUSTERING_ALGORITHMS, 0, "Clustering method used in iteration 1", 0), #cluster2 upgmb cluster1 is used in # neighborjoining iteration 1 and 2, # cluster2 in later # iterations. _Option(["-cluster2", "cluster2"], ["input"], lambda x: x in CLUSTERING_ALGORITHMS, 0, "Clustering method used in iteration 2", 0), #diaglength Integer 24 Minimum length of # diagonal. _Option(["-diaglength", "diaglength"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Minimum length of diagonal", 0), #diagmargin Integer 5 Discard this many # positions at ends of # diagonal. _Option(["-diagmargin", "diagmargin"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Discard this many positions at ends of diagonal", 0), #distance1 kmer6_6 Kmer6_6 (amino) or Distance measure for # kmer20_3 Kmer4_6 (nucleo) iteration 1. # kmer20_4 # kbit20_3 # kmer4_6 _Option(["-distance1", "distance1"], ["input"], lambda x: x in DISTANCE_MEASURES_ITER1, 0, "Distance measure for iteration 1", 0), #distance2 kmer6_6 pctid_kimura Distance measure for # kmer20_3 iterations 2, 3 ... # kmer20_4 # kbit20_3 # pctid_kimura # pctid_log _Option(["-distance2", "distance2"], ["input"], lambda x: x in DISTANCE_MEASURES_ITER2, 0, "Distance measure for iteration 2", 0), #gapopen Floating point [1] The gap open score. # Must be negative. _Option(["-gapopen", "gapopen"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Gap open score - negative number", 0), #hydro Integer 5 Window size for # determining whether a # region is hydrophobic. _Option(["-hydro", "hydro"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Window size for hydrophobic region", 0), #hydrofactor Floating point 1.2 Multiplier for gap # open/close penalties in # hydrophobic regions. 
_Option(["-hydrofactor", "hydrofactor"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Multiplier for gap penalties in hydrophobic regions", 0), #log File name None. Log file name (delete # existing file). _Option(["-log", "log"], ["output", "file"], None, 0, "Log file name", 0), #loga File name None. Log file name (append # to existing file). _Option(["-loga", "loga"], ["output", "file"], None, 0, "Log file name (append to existing file)", 0), #maxdiagbreak Integer 1 Maximum distance # between two diagonals # that allows them to # merge into one # diagonal. _Option(["-maxdiagbreak", "maxdiagbreak"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Maximum distance between two diagonals that allows " + \ "them to merge into one diagonal", 0), #maxhours Floating point None. Maximum time to run in # hours. The actual time # may exceed the # requested limit by a # few minutes. Decimals # are allowed, so 1.5 # means one hour and 30 # minutes. _Option(["-maxhours", "maxhours"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Maximum time to run in hours", 0), #maxiters Integer 1, 2 ... 16 Maximum number of # iterations. _Option(["-maxiters", "maxiters"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Maximum number of iterations", 0), #maxtrees Integer 1 Maximum number of new # trees to build in # iteration 2. _Option(["-maxtrees", "maxtrees"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Maximum number of trees to build in iteration 2", 0), #minbestcolscore Floating point [1] Minimum score a column # must have to be an # anchor. _Option(["-minbestcolscore", "minbestcolscore"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Minimum score a column must have to be an anchor", 0), #minsmoothscore Floating point [1] Minimum smoothed score # a column must have to # be an anchor. _Option(["-minsmoothscore", "minsmoothscore"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Minimum smoothed score a column must have to " + \ "be an anchor", 0), #objscore sp spm Objective score used by # ps tree dependent # dp refinement. # xp sp=sum-of-pairs score. # spf spf=sum-of-pairs score # spm (dimer approximation) # spm=sp for < 100 seqs, # otherwise spf # dp=dynamic programming # score. # ps=average profile- # sequence score. # xp=cross profile score. _Option(["-objscore", "objscore"], ["input"], lambda x: x in OBJECTIVE_SCORES, 0, "Objective score used by tree dependent refinement", 0), #root1 pseudo psuedo Method used to root _Option(["-root1", "root1"], ["input"], lambda x: x in TREE_ROOT_METHODS, 0, "Method used to root tree in iteration 1", 0), #root2 midlongestspan tree; root1 is used in # minavgleafdist iteration 1 and 2, # root2 in later # iterations. _Option(["-root2", "root2"], ["input"], lambda x: x in TREE_ROOT_METHODS, 0, "Method used to root tree in iteration 2", 0), #seqtype protein auto Sequence type. # nucleo # auto _Option(["-seqtype", "seqtype"], ["input"], lambda x: x in SEQUENCE_TYPES, 0, "Sequence type", 0), #smoothscoreceil Floating point [1] Maximum value of column # score for smoothing # purposes. _Option(["-smoothscoreceil", "smoothscoreceil"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Maximum value of column score for smoothing", 0), #smoothwindow Integer 7 Window used for anchor # column smoothing. 
_Option(["-smoothwindow", "smoothwindow"], ["input"], lambda x: isinstance(x, types.IntType), 0, "Window used for anchor column smoothing", 0), #SUEFF Floating point value 0.1 Constant used in UPGMB # between 0 and 1. clustering. Determines # the relative fraction # of average linkage # (SUEFF) vs. nearest- # neighbor linkage (1 # SUEFF). _Option(["-sueff", "sueff"], ["input"], lambda x: isinstance(x, types.FloatType), 0, "Constant used in UPGMB clustering", 0), #tree1 File name None Save tree produced in _Option(["-tree1", "tree1"], ["input"], None, 0, "Save Newick tree from iteration 1", 0), #tree2 first or second # iteration to given file # in Newick (Phylip- # compatible) format. _Option(["-tree2", "tree2"], ["input"], None, 0, "Save Newick tree from iteration 2", 0), #weight1 none clustalw Sequence weighting _Option(["-weight1", "weight1"], ["input"], lambda x: x in WEIGHTING_SCHEMES, 0, "Weighting scheme used in iteration 1", 0), #weight2 henikoff scheme. # henikoffpb weight1 is used in # gsc iterations 1 and 2. # clustalw weight2 is used for # threeway tree-dependent # refinement. # none=all sequences have # equal weight. # henikoff=Henikoff & # Henikoff weighting # scheme. # henikoffpb=Modified # Henikoff scheme as used # in PSI-BLAST. # clustalw=CLUSTALW # method. # threeway=Gotoh three- # way method. _Option(["-weight2", "weight2"], ["input"], lambda x: x in WEIGHTING_SCHEMES, 0, "Weighting scheme used in iteration 2", 0), #################### FORMATS ####################################### # Multiple formats can be specified on the command line # If -msf appears it will be used regardless of other formats # specified. If -clw appears (and not -msf), clustalw format will be # used regardless of other formats specified. If both -clw and # -clwstrict are specified -clwstrict will be used regardless of # other formats specified. If -fasta is specified and not -msf, # -clw, or clwstrict, fasta will be used. If -fasta and -html are # specified -fasta will be used. Only if -html is specified alone # will html be used. I kid ye not. #clw no Write output in CLUSTALW format (default is # FASTA). _Switch(["-clw", "clw"], ["input"], "Write output in CLUSTALW format (with a MUSCLE header). " "If you want to parse the output with Bio.AlignIO in " "Biopython, use the clwstrict output switch instead."), #clwstrict no Write output in CLUSTALW format with the # "CLUSTAL W (1.81)" header rather than the # MUSCLE version. This is useful when a post- # processing step is picky about the file # header. _Switch(["-clwstrict", "clwstrict"], ["input"], "Write output in CLUSTALW format with vers. 1.81 header, " "this is useful for parsing with Bio.AlignIO in Biopython."), #fasta yes Write output in FASTA format. Alternatives # include clw, # clwstrict, msf and html. _Switch(["-fasta", "fasta"], ["input"], "Write output in FASTA format"), #html no Write output in HTML format (default is # FASTA). _Switch(["-html", "html"], ["input"], "Write output in HTML format"), #msf no Write output in MSF format (default is # FASTA). _Switch(["-msf", "msf"], ["input"], "Write output in MSF format"), ############## END FORMATS ################################### #anchors yes Use anchor optimization in tree dependent # refinement iterations. _Switch(["-anchors", "anchors"], ["input"], "Use anchor optimisation in tree dependent " + \ "refinement iterations"), #noanchors no Disable anchor optimization. Default is # anchors. 
_Switch(["-noanchors", "noanchors"], ["input"], "Do not use anchor optimisation in tree dependent " + \ "refinement iterations"), #group yes Group similar sequences together in the # output. This is the default. See also # stable. _Switch(["-group", "group"], ["input"], "Group similar sequences in output"), #stable no Preserve input order of sequences in output # file. Default is to group sequences by # similarity (group). _Switch(["-stable", "stable"], ["input"], "Do not group similar sequences in output"), ############## log-expectation profile score ###################### # One of either -le, -sp, or -sv # # According to the doc, spn is default and the only option for # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can # be used and produce numerically different logs (what is going on?) # #spn fails on proteins #le maybe Use log-expectation profile score (VTML240). # Alternatives are to use sp or sv. This is # the default for amino acid sequences. _Switch(["-le", "le"], ["input"], "Use log-expectation profile score (VTML240)"), #sv no Use sum-of-pairs profile score (VTML240). # Default is le. _Switch(["-sv", "sv"], ["input"], "Use sum-of-pairs profile score (VTML240)"), #sp no Use sum-of-pairs protein profile score # (PAM200). Default is le. _Switch(["-sp", "sp"], ["input"], "Use sum-of-pairs protein profile score (PAM200)"), #spn maybe Use sum-of-pairs nucleotide profile score # (BLASTZ parameters). This is the only option # for nucleotides, and is therefore the # default. _Switch(["-spn", "spn"], ["input"], "Use sum-of-pairs protein nucleotide profile score"), ############## END log-expectation profile score ###################### #quiet no Do not display progress messages. _Switch(["-quiet", "quiet"], ["input"], "Use sum-of-pairs protein nucleotide profile score"), #refine no Input file is already aligned, skip first # two iterations and begin tree dependent # refinement. _Switch(["-refine", "refine"], ["input"], "Only do tree dependent refinement"), #core yes in muscle, Do not catch exceptions. # no in muscled. _Switch(["-core", "core"], ["input"], "Catch exceptions"), #nocore no in muscle, Catch exceptions and give an error message # yes in muscled. if possible. _Switch(["-nocore", "nocore"], ["input"], "Do not catch exceptions"), #termgapsfull no Terminal gaps penalized with full penalty. # [1] Not fully supported in this version. # #termgapshalf yes Terminal gaps penalized with half penalty. # [1] Not fully supported in this version. # #termgapshalflonger no Terminal gaps penalized with half penalty if # gap relative to # longer sequence, otherwise with full # penalty. # [1] Not fully supported in this version. #verbose no Write parameter settings and progress # messages to log file. _Switch(["-verbose", "verbose"], ["input"], "Write parameter settings and progress"), #version no Write version string to stdout and exit. _Switch(["-version", "version"], ["input"], "Write version string to stdout and exit"), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="dialign2-2", **kwargs): self.program_name = cmd self.parameters = \ [ _Switch(["-afc", "afc"], "Creates additional output file '*.afc' " "containing data of all fragments considered " "for alignment WARNING: this file can be HUGE !"), _Switch(["-afc_v", "afc_v"], "Like '-afc' but verbose: fragments are explicitly " "printed. WARNING: this file can be EVEN BIGGER !"), _Switch(["-anc", "anc"], "Anchored alignment. Requires a file <seq_file>.anc " "containing anchor points."), _Switch(["-cs", "cs"], "If segments are translated, not only the `Watson " "strand' but also the `Crick strand' is looked at."), _Switch(["-cw", "cw"], "Additional output file in CLUSTAL W format."), _Switch(["-ds", "ds"], "`dna alignment speed up' - non-translated nucleic acid " "fragments are taken into account only if they start " "with at least two matches. Speeds up DNA alignment at " "the expense of sensitivity."), _Switch(["-fa", "fa"], "Additional output file in FASTA format."), _Switch(["-ff", "ff"], "Creates file *.frg containing information about all " "fragments that are part of the respective optimal " "pairwise alignmnets plus information about " "consistency in the multiple alignment"), _Option(["-fn", "fn"], "Output files are named <out_file>.<extension>.", equate=False), _Switch(["-fop", "fop"], "Creates file *.fop containing coordinates of all " "fragments that are part of the respective pairwise alignments."), _Switch(["-fsm", "fsm"], "Creates file *.fsm containing coordinates of all " "fragments that are part of the final alignment"), _Switch(["-iw", "iw"], "Overlap weights switched off (by default, overlap " "weights are used if up to 35 sequences are aligned). " "This option speeds up the alignment but may lead " "to reduced alignment quality."), _Switch(["-lgs", "lgs"], "`long genomic sequences' - combines the following " "options: -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, " "-fop, -ff, -cs, -ds, -pst "), _Switch(["-lgs_t", "lgs_t"], "Like '-lgs' but with all segment pairs assessed " "at the peptide level (rather than 'mixed alignments' " "as with the '-lgs' option). Therefore faster than " "-lgs but not very sensitive for non-coding regions."), _Option(["-lmax", "lmax"], "Maximum fragment length = x (default: x = 40 or " "x = 120 for `translated' fragments). 
Shorter x " "speeds up the program but may affect alignment quality.", checker_function=lambda x: isinstance(x, int), equate=False), _Switch(["-lo", "lo"], "(Long Output) Additional file *.log with information " "about fragments selected for pairwise alignment and " "about consistency in multi-alignment procedure."), _Switch(["-ma", "ma"], "`mixed alignments' consisting of P-fragments and " "N-fragments if nucleic acid sequences are aligned."), _Switch(["-mask", "mask"], "Residues not belonging to selected fragments are " "replaced by `*' characters in output alignment " "(rather than being printed in lower-case characters)"), _Switch(["-mat", "mat"], "Creates file *mat with substitution counts derived " "from the fragments that have been selected for alignment."), _Switch(["-mat_thr", "mat_thr"], "Like '-mat' but only fragments with weight score " "> t are considered"), _Switch(["-max_link", "max_link"], "'maximum linkage' clustering used to construct " "sequence tree (instead of UPGMA)."), _Switch(["-min_link", "min_link"], "'minimum linkage' clustering used."), _Option(["-mot", "mot"], "'motif' option.", equate=False), _Switch(["-msf", "msf"], "Separate output file in MSF format."), _Switch(["-n", "n"], "Input sequences are nucleic acid sequences. " "No translation of fragments."), _Switch(["-nt", "nt"], "Input sequences are nucleic acid sequences and " "`nucleic acid segments' are translated to `peptide " "segments'."), _Switch(["-nta", "nta"], "`no textual alignment' - textual alignment suppressed. " "This option makes sense if other output files are of " "interest -- e.g. the fragment files created with -ff, " "-fop, -fsm or -lo."), _Switch(["-o", "o"], "Fast version, resulting alignments may be slightly " "different."), _Switch(["-ow", "ow"], "Overlap weights enforced (By default, overlap weights " "are used only if up to 35 sequences are aligned since " "calculating overlap weights is time consuming)."), _Switch(["-pst", "pst"], "'print status'. Creates and updates a file *.sta with " "information about the current status of the program " "run. This option is recommended if large data sets " "are aligned since it allows the user to estimate the " "remaining running time."), _Switch(["-smin", "smin"], "Minimum similarity value for first residue pair " "(or codon pair) in fragments. Speeds up protein " "alignment or alignment of translated DNA fragments " "at the expense of sensitivity."), _Option(["-stars", "stars"], "Maximum number of `*' characters indicating degree " "of local similarity among sequences. By default, no " "stars are used but numbers between 0 and 9, instead.", checker_function=lambda x: x in range(0, 10), equate=False), _Switch(["-stdo", "stdo"], "Results written to standard output."), _Switch(["-ta", "ta"], "Standard textual alignment printed (overrides " "suppression of textual alignments in special " "options, e.g. -lgs)"), _Option(["-thr", "thr"], "Threshold T = x.", checker_function=lambda x: isinstance(x, int), equate=False), _Switch(["-xfr", "xfr"], "'exclude fragments' - list of fragments can be " "specified that are NOT considered for pairwise alignment"), _Argument(["input"], "Input file name. Must be FASTA format", filename=True, is_required=True), ] AbstractCommandline.__init__(self, cmd, **kwargs)
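# Usage sketch (assumption: this is the DialignCommandline wrapper from
# Bio.Align.Applications, with a "dialign2-2" binary on the PATH and the
# DIALIGN2_DIR environment variable pointing at its data directory). The
# FASTA input file is a required trailing argument:
#
#   >>> from Bio.Align.Applications import DialignCommandline
#   >>> cline = DialignCommandline(input="unaligned.fasta", fn="result", fa=True)
#   >>> print(cline)  # roughly: dialign2-2 -fa -fn result unaligned.fasta
#   >>> stdout, stderr = cline()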
def __init__(self, cmd="hhblits", **kwargs): # TODO: Figure out how to do mutual groups if 'local' in list(kwargs.keys()) and 'global' in list(kwargs.keys()): raise ValueError( "Use only one of \"global_aln/local_aln\" alignment modes") self.parameters = [ _Option(['-i', 'input'], 'single sequence or multiple sequence alignment in ' 'a3m, a2m, or FASTA format, or HMM in hmm format', filename=True, is_required=True, equate=False), # Options _Option(['-d', 'database'], 'database name (e.g. uniprot20_29Feb2012)', is_required=True, equate=False), _Option(['-n', 'niterations'], 'number of iterations [default: 2]', equate=False), _Option( ['-e', 'evalue'], 'E-value cutoff for inclusion in result alignment [default: 0.001]', equate=False), # # Input alignment options # _Option(['-M', 'a2m'], # 'use A2M/A3M input alignment format', # equate=False), # _Option(['-M', 'fasta'], # 'use FASTA input alignment format', # equate=False), # _Option(['-M', 'match_states'], # 'use FASTA: columns with fewer than X% gaprs are match states', # equate=False), # Output options _Option( ['-o', 'output'], 'write results in standard format to file [default: <infile.hhr>]', filename=True, equate=False), _Option(['-oa3m', 'oa3m'], 'write result MSA with significant matches in a3m format', filename=True, equate=False), _Option(['-ohhm', 'ohhm'], 'write result MSA with significant matches in hmm format', filename=True, equate=False), _Option(['-opsi', 'opsi'], 'write result MSA with significant matches in psi format', filename=True, equate=False), _Option(['-oalis', 'oalis'], 'write MSAs in A3M format after each iteration', filename=True, equate=False), # Filter options applied to query MSA, database MSAs, and result MSA _Switch( ['-all', 'show_all'], 'show all sequences in result MSA; do not filter result MSA'), _Option(['-id', 'id'], 'maximum pairwise sequence identity [default: 90]', equate=False), _Option( ['-diff', 'diff'], 'filter MSAs by selecting most diverse set of sequences, keeping ' 'at least this many seqs in each MSA block of length 50 [default: 1000]', equate=False), _Option(['-cov', 'cov'], 'minimum coverage with master sequence (%) [default: 0]', equate=False), _Option( ['-qid', 'qid'], 'minimum sequence identity with master sequence (%) [default: 0]', equate=False), _Option( ['-qsc', 'qsc'], 'minimum score per column with master sequence [default: -20.0]', equate=False), _Option( ['-neff', 'neff'], 'target diversity of multiple sequence alignment [default: off]', equate=False), # HMM-HMM alignment options _Switch([ '-norealign', 'norealign' ], 'do NOT realign displayed hits with MAC algorithm [default: realign]' ), _Option( ['-mact', 'mac_realignment_threshold'], 'posterior probability threshold for MAC re-alignment [default: 0.350], ' 'Parameter controls alignment greediness: 0:global >0.1:local', equate=False), _Switch([ '-glob', 'global_aln' ], 'use global alignment mode for searching/ranking [default: local]' ), _Switch([ '-loc', 'loca_alnl' ], 'use local alignment mode for searching/ranking [default: local]' ), # Other options _Option( ['-v', 'verbose'], 'verbose mode: 0:no screen output 1:only warings 2: verbose [default: 2]', equate=False), _Option( ['-neffmax', 'neffmax'], 'skip further search iterations when diversity Neff of query ' 'MSA becomes larger than neffmax [default: 10.0]', equate=False), _Option( ['-cpu', 'cpu'], 'number of CPUs to use (for shared memory SMPs) [default: 2]'), # Extra options from `-h all` _Option( ['-maxfilt', 'maxfilt'], 'max number of hits allowed to pass 2nd 
prefilter (default=20000)', equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
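# Usage sketch: no class name is shown for this wrapper, so assume it is
# exposed as, say, HHBlitsCommandline (hypothetical name) with an hhblits
# binary and a sequence database installed. The -glob/-loc switches are
# mutually exclusive, which the check at the top of the constructor enforces:
#
#   >>> cline = HHBlitsCommandline(input="query.fasta",
#   ...                            database="uniprot20_2016_02",
#   ...                            niterations=3, oa3m="query.a3m")
#   >>> stdout, stderr = cline()  # results go to query.hhr (default) and query.a3m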
def __init__(self, cmd="XXmotif", **kwargs): # order of parameters is the same as in XXmotif --help _valid_alphabet = set("ACGTNX") self.parameters = \ [ _Argument(["outdir", "OUTDIR"], "output directory for all results", filename = True, is_required = True, # XXmotif currently does not accept spaces in the outdir name checker_function = lambda x: " " not in x), _Argument(["seqfile", "SEQFILE"], "file name with sequences from positive set in FASTA format", filename = True, is_required = True, # XXmotif currently only accepts a pure filename checker_function = lambda x: os.path.split(x)[0] == ""), # Options _Option(["--negSet", "negSet", "NEGSET", "negset"], "sequence set which has to be used as a reference set", filename = True, equate = False), _Switch(["--zoops", "ZOOPS", "zoops"], "use zero-or-one occurrence per sequence model (DEFAULT)"), _Switch(["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model"), _Switch(["--oops", "OOPS", "oops"], "use one occurrence per sequence model"), _Switch(["--revcomp", "REVCOMP", "revcomp"], "search in reverse complement of sequences as well (DEFAULT: NO)"), _Option(["--background-model-order", "background-model-order", "BACKGROUND-MODEL-ORDER", "background_model_order"], "order of background distribution (DEFAULT: 2, 8(--negset) )", checker_function = lambda x: isinstance(x, int), equate = False), _Option(["--pseudo", "PSEUDO", "pseudo"], "percentage of pseudocounts used (DEFAULT: 10)", checker_function = lambda x: isinstance(x, int), equate = False), _Option(["-g", "--gaps", "GAPS", "gaps"], "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)", checker_function = lambda x: x in [0-3], equate = False), _Option(["--type", "TYPE", "type"], "defines what kind of start seeds are used (DEFAULT: ALL)" "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM", checker_function = lambda x: x in ["ALL", "all", "FIVEMERS", "fivemers", "PALINDROME", "palindrome", "TANDEM", "tandem", "NOPALINDROME", "nopalindrome", "NOTANDEM", "notandem"], equate = False), _Option(["--merge-motif-threshold", "merge-motif-threshold", "MERGE-MOTIF-THRESHOLD", "merge_motif_threshold"], "defines the similarity threshold for merging motifs (DEFAULT: HIGH)" "possible modes: LOW, MEDIUM, HIGH", checker_function = lambda x: x in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"], equate = False), _Switch(["--no-pwm-length-optimization", "no-pwm-length-optimization", "NO-PWM-LENGTH-OPTIMIZATION", "no_pwm_length_optimization"], "do not optimize length during iterations (runtime advantages)"), _Option(["--max-match-positions", "max-match-positions", "MAX-MATCH-POSITIONS", "max_match_positions"], "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)", checker_function = lambda x: isinstance(x, int), equate = False), _Switch(["--batch", "BATCH", "batch"], "suppress progress bars (reduce output size for batch jobs)"), _Option(["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"], "maximum number of sequences from the positive set used [DEFAULT: all]", checker_function = lambda x: isinstance(x, int), equate = False), # does not make sense in biopython #_Switch(["--help", "help", "HELP"], # "print this help page"), _Option(["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"], "inspect extensions and refinement of a given seed (DEFAULT: not used)", checker_function = lambda x: any((c in _valid_alphabet) for c in x), equate = False), # Using conservation information 
_Option(["--format", "FORMAT", "format"], "defines what kind of format the input sequences have (DEFAULT: FASTA)", checker_function = lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"], equate = False), _Option(["--maxMultipleSequences", "maxMultipleSequences", "MAXMULTIPLESEQUENCES", "maxmultiplesequences"], "maximum number of sequences used in an alignment [DEFAULT: all]", checker_function = lambda x: isinstance(x, int), equate = False), # Using localization information _Switch(["--localization", "LOCALIZATION", "localization"], "use localization information to calculate combined P-values" "(sequences should have all the same length)"), _Option(["--downstream", "DOWNSTREAM", "downstream"], "number of residues in positive set downstream of anchor point (DEFAULT: 0)", checker_function = lambda x: isinstance(x, int), equate = False), # Start with self defined motif _Option(["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"], "Start motif (IUPAC characters)", checker_function = lambda x: any((c in _valid_alphabet) for c in x), equate = False), _Option(["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"], "profile file", filename = True, equate = False), _Option(["--startRegion", "startRegion", "STARTREGION", "startregion"], "expected start position for motif occurrences relative to anchor point (--localization)", checker_function = lambda x: isinstance(x, int), equate = False), _Option(["--endRegion", "endRegion", "ENDREGION", "endregion"], "expected end position for motif occurrences relative to anchor point (--localization)", checker_function = lambda x: isinstance(x, int), equate = False), # XXmotif wrapper options _Switch(["--XXmasker", "masker"], "mask the input sequences for homology, repeats and low complexity regions"), _Switch(["--XXmasker-pos", "maskerpos"], "mask only the positive set for homology, repeats and low complexity regions"), _Switch(["--no-graphics", "nographics"], "run XXmotif without graphical output"), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd="fdnapars", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "seq file to use (phylip)", filename=True, is_required=True, ), _Option(["-intreefile", "intreefile"], "Phylip tree file"), _Option(["-weights", "weights"], "weights file"), _Option(["-maxtrees", "maxtrees"], "max trees to save during run"), _Option(["-thorough", "thorough"], "more thorough search (Y/n)"), _Option(["-rearrange", "rearrange"], "Rearrange on just 1 best tree (Y/n)"), _Option( ["-transversion", "transversion"], "Use tranversion parsimony (y/N)" ), _Option( ["-njumble", "njumble"], "number of times to randomise input order (default is 0)", ), _Option(["-seed", "seed"], "provide random seed"), _Option(["-outgrno", "outgrno"], "Specify outgroup"), _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), _Option(["-threshold", "threshold"], "Threshold value"), _Option(["-trout", "trout"], "Write trees to file (Y/n)"), _Option(["-outtreefile", "outtreefile"], "filename for output tree"), _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="cd-hit", **kwargs): self.parameters = [ _Option( ['-i', 'input'], 'input filename in fasta format, required', filename=True, equate=False, is_required=True), _Option(['-o', 'output'], 'output filename, required', filename=True, equate=False, is_required=True), _Option( ['-c', 'seq_id_thres'], "sequence identity threshold, default 0.9 " "this is the default cd-hit's 'global sequence identity' calculated as: " "number of identical amino acids in alignment divided by " "the full length of the shorter sequence", equate=False), _Option( ['-G', 'global_seq_id'], "use global sequence identity, default 1 " "if set to 0, then use local sequence identity, calculated as : " "number of identical amino acids in alignment " "divided by the length of the alignment " "NOTE!!! don't use -G 0 unless you use alignment coverage controls " "see options -aL (kwarg: `cov_alignment_long`), -AL (kwarg: `cov_alignment_long_control`)," " -aS (kwarg: `cov_alignment_short`), -AS (kwarg: `cov_alignment_short_control`)", equate=False), _Option(['-b', 'band_width'], 'band_width of alignment, default 20', equate=False), _Option( ['-M', 'memory_limit'], 'memory limit (in MB) for the program, default 800; 0 for unlimited', equate=False), _Option(['-T', 'num_threads'], 'number of threads, default 1; with 0, all CPUs will be used', equate=False), _Option(['-n', 'word_length'], "word_length, default 5, see user's guide for choosing it", equate=False), _Option(['-l', 'len_throw_away_seqs'], "length of throw_away_sequences, default 10", equate=False), _Option(['-t', 'tol_4_redundance'], "tolerance for redundance, default 2", equate=False), _Option( ['-d', 'len_desc'], "length of description in .clstr file, default 20 " "if set to 0, it takes the fasta defline and stops at first space " "-s length difference cutoff, default 0.0", equate=False), _Option( ['-s', 'len_diff_cutoff'], "length difference cutoff, default 0.0 " "if set to 0.9, the shorter sequences need to be " "at least 90% length of the representative of the cluster", equate=False), _Option( ['-S', 'len_diff_cutoff_aa'], "length difference cutoff in amino acid, default 999999 " "if set to 60, the length difference between the shorter sequences " "and the representative of the cluster can not be bigger than 60", equate=False), _Option( ['-aL', 'cov_alignment_long'], "alignment coverage for the longer sequence, default 0.0 " "if set to 0.9, the alignment must covers 90% of the sequence", equate=False), _Option( ['-AL', 'cov_alignment_long_control'], "alignment coverage control for the longer sequence, default 99999999 " "if set to 60, and the length of the sequence is 400, " "then the alignment must be >= 340 (400-60) residues", equate=False), _Option( ['-aS', 'cov_alignment_short'], "alignment coverage for the shorter sequence, default 0.0 " "if set to 0.9, the alignment must covers 90% of the sequence", equate=False), _Option( ['-AS', 'cov_alignment_short_control'], "alignment coverage control for the shorter sequence, default 99999999 " "if set to 60, and the length of the sequence is 400, " "then the alignment must be >= 340 (400-60) residues", equate=False), _Option( ['-A', 'cov_alignment'], "minimal alignment coverage control for the both sequences, default 0 " "alignment must cover >= this value for both sequences", equate=False), _Option( ['-uL', 'max_unmatched_percentage_long'], "maximum unmatched percentage for the longer sequence, default 1.0 " "if set to 0.1, the unmatched region (excluding leading and tailing gaps) " "must not be more 
than 10% of the sequence", equate=False), _Option( ['-uS', 'max_unmatched_percentage_short'], "maximum unmatched percentage for the shorter sequence, default 1.0 " "if set to 0.1, the unmatched region (excluding leading and tailing gaps) " "must not be more than 10% of the sequence", equate=False), _Option( ['-U', 'len_max_unmatched'], "maximum unmatched length, default 99999999 " "if set to 10, the unmatched region (excluding leading and tailing gaps) " "must not be more than 10 bases", equate=False), _Option( ['-B', 'hdd_storage'], "1 or 0, default 0, by default, sequences are stored in RAM " "if set to 1, sequence are stored on hard drive " "it is recommended to use -B 1 for huge databases", equate=False), _Option( ['-p', 'aln_overlap_2_file'], "1 or 0, default 0 " "if set to 1, print alignment overlap in .clstr file", equate=False), _Option( ['-g', 'accurate_mode'], "1 or 0, default 0 " "by cd-hit's default algorithm, a sequence is clustered to the first " "cluster that meet the threshold (fast cluster). If set to 1, the program " "will cluster it into the most similar cluster that meet the threshold " "(accurate but slow mode) " "but either 1 or 0 won't change the representatives of final clusters", equate=False), _Option(['-bak', 'backup'], "write backup cluster file (1 or 0, default 0)", equate=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
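# Usage sketch: cd-hit has no wrapper in Biopython itself, so assume this class
# is exposed under a name such as CDHitCommandline (hypothetical). The keyword
# names map directly onto the single-letter cd-hit flags documented above:
#
#   >>> cline = CDHitCommandline(input="proteins.fasta", output="nr90",
#   ...                          seq_id_thres=0.9, word_length=5,
#   ...                          num_threads=4, memory_limit=2000)
#   >>> print(cline)  # roughly: cd-hit -i proteins.fasta -o nr90 -c 0.9 ...
#   >>> stdout, stderr = cline()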
def __init__(self, cmd="fprotpars", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "seq file to use (phylip)", filename=True, is_required=True, ), _Option(["-intreefile", "intreefile"], "Phylip tree file to score"), _Option( ["-outtreefile", "outtreefile"], "phylip tree output file", filename=True, is_required=True, ), _Option(["-weights", "weights"], "weights file"), _Option(["-whichcode", "whichcode"], "which genetic code, [U,M,V,F,Y]]"), _Option( ["-njumble", "njumble"], "number of times to randomise input order (default is 0)", ), _Option(["-seed", "seed"], "provide random seed"), _Option(["-outgrno", "outgrno"], "Specify outgroup"), _Option(["-thresh", "thresh"], "Use threshold parsimony (y/N)"), _Option(["-threshold", "threshold"], "Threshold value"), _Option(["-trout", "trout"], "Write trees to file (Y/n)"), _Option(["-dotdiff", "dotdiff"], "Use dot-differencing? [Y/n]"), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="fprotdist", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-sequence", "sequence"], "seq file to use (phylip)", filename=True, is_required=True, ), _Option( ["-ncategories", "ncategories"], "number of rate catergories (1-9)" ), _Option(["-rate", "rate"], "rate for each category"), _Option(["-catergories", "catergories"], "file of rates"), _Option(["-weights", "weights"], "weights file"), _Option(["-method", "method"], "sub. model [j,h,d,k,s,c]"), _Option(["-gamma", "gamma"], "gamma [g, i,c]"), _Option( ["-gammacoefficient", "gammacoefficient"], "value for gamma (> 0.001)" ), _Option( ["-invarcoefficient", "invarcoefficient"], "float for variation of substitution rate among sites", ), _Option(["-aacateg", "aacateg"], "Choose the category to use [G,C,H]"), _Option(["-whichcode", "whichcode"], "genetic code [c,m,v,f,y]"), _Option(["-ease", "ease"], "Pob change catergory (float between -0 and 1)"), _Option(["-ttratio", "ttratio"], "Transition/transversion ratio (0-1)"), _Option( ["-basefreq", "basefreq"], "DNA base frequencies (space separated list)" ), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="guidance", align=True, **kwargs): # order parameters in the same order as invoking guidance on the cmd line (e.g. 'perl guidance.pl') if align is True: self.parameters = \ [ # Required Parameters _Option(['--seqFile', 'seqFile'], "Input sequence file in FASTA format", filename=True, equate=False, is_required=True, checker_function=lambda x: str(Path(x).suffix) in ['.fasta', 'fna', '.ffn', '.faa', '.fra'] and Path(x).is_file()), _Option(['--msaProgram', 'msaProgram'], "Which MSA program to use", equate=False, is_required=True, checker_function=lambda x: x in ['MAFFT', 'PRANK', 'CLUSTALW', 'MUSCLE']), _Option(['--seqType', 'seqType'], "Type of sequences for alignment (amino acids, nucleotides, or codons)", equate=False, is_required=True, checker_function=lambda x: x in ['aa', 'nuc', 'codon']), _Option(['--outDir', 'outDir'], "Output directory that will be created " "automatically and hold all output files [please provid full (and not relative) path]", filename=True, equate=False, is_required=True), # Optional Parameters _Option(['--program', 'program'], "[GUIDANCE2|GUIDANCE|HoT] Default=GUIDANCE2", equate=False, checker_function=lambda x: x in ["GUIDANCE2", "GUIDANCE", "HoT"]), _Option(['--bootstraps', 'bootstraps'], "Number of bootstrap iterations (only for GUIDQANCE). Defaut=100", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(['--genCode', 'genCode'], "Genetic code identifier (only for codon sequences). Default=1 \ 1) Nuclear Standard\ 15) Nuclear Blepharisma\ 6) Nuclear Ciliate\ 10) Nuclear Euplotid\ 2) Mitochondria Vertebrate\ 5) Mitochondria Invertebrate\ 3) Mitochondria Yeast\ 13) Mitochondria Ascidian\ 9) Mitochondria Echinoderm\ 14) Mitochondria Flatworm\ 4) Mitochondria Protozoan" , equate=False, checker_function=lambda x: isinstance(x, int)), _Option(['--outOrder', 'outOrder'], "[aligned|as_input] default=aligned", equate=False, checker_function=lambda x: x in ['aligned', 'as_input']), _Option(['--msaFile', 'msaFile'], "Input alignment file - not recommended", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), # Confidence scores _Option(['--seqCutoff', 'seqCutoff'], "Confidence cutoff between 0 to 1. Default=0.6", equate=False, checker_function=lambda x: isinstance(x, (int, float))), _Option(['--colCutoff', 'colCutoff'], "Confidence cutoff between 0 to 1. Default=0.93", equate=False, checker_function=lambda x: isinstance(x, (int, float))), # Alignment Programs _Option(['--mafft', 'mafft'], "path to mafft executable. Default=mafft", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), _Option(['--prank', 'prank'], "path to prank executable. Default=prank", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), _Option(['--muscle', 'muscle'], "path to muscle executable. default=muscle", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), _Option(['--pagan', 'pagan'], "path to pagan executable, default=pagan", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), _Option(['--ruby', 'ruby'], "path to ruby executable. default=ruby", filename=True, equate=False, checker_function=lambda x: Path(x).is_file()), # Miscellaneous _Option(['--dataset', 'dataset'], "Unique name for the Dataset - will be used as prefix to outputs (default=MSA)", equate=False), _Option(['--MSA_Param', 'MSA_Param'], "passing parameters for the alignment program e.g -F to prank. " "To pass parameter containning '-' in it, add \ before each '-' e.g. 
\-F for PRANK", equate=False), _Option(['--proc_num', 'proc_num'], "number of processors to use (default=1)", equate=False, checker_function=lambda x: isinstance(x, int)) # Other Guidance scripts ] ACmd = AbstractCommandline.__init__(self, cmd, **kwargs) maskDir = ACmd.__getattribute__('outDir') if 'maskCutoff' in kwargs.keys(): if 'maskDir' in kwargs.keys(): maskDir = kwargs['maskDir'] os.chdir(maskDir) cmd = "maskLowScoreResidues" self.parameters = \ [ _Argument(['maskFile'], "Input alignment file for masking.", filename=True, is_required=True), _Argument(['rprScores'], "Residue pair Residue reliability scores.", filename=True, is_required=True), _Argument(['output'], "Absolute path of output file.", filename=True, is_required=True), _Argument(['maskCutoff'], "Confidence cutoff between 0 to 1.", filename=True, is_required=True), _Argument(['seqType'], "Type of sequences for alignment (amino acids or nucleotides)", is_required=True, checker_function=lambda x: x in ['aa', 'nuc']) ] AbstractCommandline.__init__(self, cmd, **kwargs)
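# Usage sketch: GUIDANCE has no official Biopython wrapper, so assume this
# class is exposed under a name such as GuidanceCommandline (hypothetical).
# Only the four required options are shown; outDir must be a full path and the
# sequence file must exist for the checker_function to accept it:
#
#   >>> cline = GuidanceCommandline(seqFile="seqs.fasta", msaProgram="MAFFT",
#   ...                             seqType="aa", outDir="/tmp/guidance_run")
#   >>> stdout, stderr = cline()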
def __init__(self, cmd="needleall", **kwargs): """Initialize the class.""" self.parameters = [ _Option( ["-asequence", "asequence"], "First sequence to align", filename=True, is_required=True, ), _Option( ["-bsequence", "bsequence"], "Second sequence to align", filename=True, is_required=True, ), _Option(["-gapopen", "gapopen"], "Gap open penalty", is_required=True), _Option( ["-gapextend", "gapextend"], "Gap extension penalty", is_required=True ), _Option(["-datafile", "datafile"], "Matrix file", filename=True), _Option( ["-minscore", "minscore"], "Exclude alignments with scores below this threshold score.", ), _Option(["-errorfile", "errorfile"], "Error file to be written to."), _Option(["-endweight", "endweight"], "Apply And gap penalties"), _Option( ["-endopen", "endopen"], "The score taken away when an end gap is created.", ), _Option( ["-endextend", "endextend"], "The score added to the end gap penality for each base or " "residue in the end gap.", ), _Switch( ["-nobrief", "nobrief"], "Display extended identity and similarity" ), _Switch(["-brief", "brief"], "Display brief identity and similarity"), _Option( ["-similarity", "similarity"], "Display percent identity and similarity" ), _Option( ["-snucleotide", "snucleotide"], "Sequences are nucleotide (boolean)" ), _Option(["-sprotein", "sprotein"], "Sequences are protein (boolean)"), _Option( ["-aformat", "aformat"], "Display output in a different specified output format", ), ] _EmbossCommandLine.__init__(self, cmd, **kwargs)
def __init__(self, cmd="samtools", **kwargs): """Initialize the class.""" self.program_name = cmd self.parameters = [ _StaticArgument("view"), _Switch(["-b", "b"], "Output in the BAM format"), _Switch(["-c", "c"], """Instead of printing the alignments, only count them and print the total number. All filter options, such as '-f', '-F' and '-q', are taken into account"""), _Switch(["-h", "h"], "Include the header in the output"), _Switch(["-u", "u"], """Output uncompressed BAM. This option saves time spent on compression/decompression and is thus preferred when the output is piped to another samtools command"""), _Switch(["-H", "H"], "Output the header only"), _Switch(["-S", "S"], """Input is in SAM. If @SQ header lines are absent, the '-t' option is required."""), _Option(["-t", "t"], """This file is TAB-delimited. Each line must contain the reference name and the length of the reference, one line for each distinct reference; additional fields are ignored. This file also defines the order of the reference sequences in sorting. If you run 'samtools faidx <ref.fa>', the resultant index file <ref.fa>.fai can be used as this <in.ref_list> file.""", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-o", "o"], "Output file", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-f", "f"], """Only output alignments with all bits in INT present in the FLAG field""", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-F", "F"], "Skip alignments with bits present in INT", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-q", "q"], "Skip alignments with MAPQ smaller than INT", equate=False, checker_function=lambda x: isinstance(x, int)), _Option(["-r", "r"], "Only output reads in read group STR", equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-R", "R"], "Output reads in read groups listed in FILE", filename=True, equate=False, checker_function=lambda x: isinstance(x, str)), _Option(["-l", "l"], "Only output reads in library STR", equate=False, checker_function=lambda x: isinstance(x, str)), _Switch(["-1", "fast_bam"], "Use zlib compression level 1 to compress the output"), _Argument(["input", "input_file"], "Input File Name", filename=True, is_required=True), _Argument(["region"], "Region", is_required=False), ] AbstractCommandline.__init__(self, cmd, **kwargs)
def __init__(self, cmd='fasttree', **kwargs): self.parameters = [ _Switch(['-nt', 'nt'], "By default FastTree expects protein alignments, use -nt for nucleotides", ), _Option(['-n', 'n'], """-n -- read N multiple alignments in. This only works with phylip interleaved format. For example, you can use it with the output from phylip's seqboot. If you use -n, FastTree will write 1 tree per line to standard output. """, checker_function=_is_int, equate=False, ), _Switch(['-quote', 'quote'], """-quote -- add quotes to sequence names in output. Quote sequence names in the output and allow spaces, commas, parentheses, and colons in them but not ' characters (fasta files only). """, ), _Option(['-pseudo', 'pseudo'], """-pseudo [weight] -- Pseudocounts are used with sequence distance estimation. Use pseudocounts to estimate distances between sequences with little or no overlap. (Off by default.) Recommended when the alignment has sequences with little or no overlap. If the weight is not specified, it is 1.0 """, checker_function=_is_numeric, equate=False, ), _Option(['-boot', 'boot'], """Specify the number of resamples for support values. Support value options: By default, FastTree computes local support values by resampling the site likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, it will compute minimum-evolution bootstrap supports instead In either case, the support values are proportions ranging from 0 to 1 Use -nosupport to turn off support values or -boot 100 to use just 100 resamples. """, checker_function=_is_int, equate=False, ), _Switch(['-nosupport', 'nosupport'], """Turn off support values. Support value options: By default, FastTree computes local support values by resampling the site likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, it will compute minimum-evolution bootstrap supports instead In either case, the support values are proportions ranging from 0 to 1 Use -nosupport to turn off support values or -boot 100 to use just 100 resamples. """, ), _Option(['-intree', 'intree'], """-intree newickfile -- read the starting tree in from newickfile. Any branch lengths in the starting trees are ignored. -intree with -n will read a separate starting tree for each alignment. """, filename=True, equate=False, ), _Option(['-intree1', 'intree1'], "-intree1 newickfile -- read the same starting tree for each alignment.", filename=True, equate=False, ), _Switch(['-quiet', 'quiet'], """-quiet -- do not write to standard error during normal operation (no progress indicator, no options summary, no likelihood values, etc.) """, ), _Switch(['-nopr', 'nopr'], "-nopr -- do not write the progress indicator to stderr.", ), _Option(['-nni', 'nni'], """Set the rounds of minimum-evolution nearest-neighbor interchanges Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs.
""", checker_function=_is_int, equate=False, ), _Option(['-spr', 'spr'], """Set the rounds of subtree-prune-regraft moves Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. """, checker_function=_is_int, equate=False, ), _Switch(['-noml', 'noml'], """Deactivate min-evo NNIs and SPRs. Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -noml to turn off both min-evo NNIs and SPRs (useful if refining an approximately maximum-likelihood tree with further NNIs). """, ), _Switch(['-mllen', 'mllen'], """Optimize branch lengths on a fixed topology. Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -mllen to optimize branch lengths without ML NNIs Use -mllen -nome with -intree to optimize branch lengths on a fixed topology. """, ), _Switch(['-nome', 'nome'], """Changes support values calculation to a minimum-evolution bootstrap method. Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -mllen to optimize branch lengths without ML NNIs Use -mllen -nome with -intree to optimize branch lengths on a fixed topology Support value options: By default, FastTree computes local support values by resampling the site likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, it will compute minimum-evolution bootstrap supports instead In either case, the support values are proportions ranging from 0 to 1. """, ), _Option(['-mlnni', 'mlnni'], """Set the number of rounds of maximum-likelihood NNIs. Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -mlnni to set the number of rounds of maximum-likelihood NNIs. """, checker_function=_is_int, equate=False, ), _Option(['-mlacc', 'mlacc'], """Option for optimization of branches at each NNI. 
Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI, and to optimize all 5 branches in 2 or 3 rounds. """, checker_function=_is_int, equate=False, ), _Switch(['-slownni', 'slownni'], """Turn off heuristics to avoid constant subtrees with NNIs. Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. Use -slownni to turn off heuristics to avoid constant subtrees (affects both ML and ME NNIs). """, ), _Switch(['-wag', 'wag'], """Maximum likelihood model options. Whelan-And-Goldman 2001 model instead of (default) Jones-Taylor-Thornton 1992 model (a.a. only) """, ), _Switch(['-gtr', 'gtr'], """Maximum likelihood model options. Use generalized time-reversible instead of (default) Jukes-Cantor (nt only) """, ), _Option(['-cat', 'cat'], """Maximum likelihood model options. Specify the number of rate categories of sites (default 20).""", checker_function=_is_int, equate=False, ), _Switch(['-nocat', 'nocat'], "Maximum likelihood model options: No CAT model (just 1 category)", ), _Switch(['-gamma', 'gamma'], """Report the likelihood under the discrete gamma model. Maximum likelihood model options: -gamma -- after the final round of optimizing branch lengths with the CAT model, report the likelihood under the discrete gamma model with the same number of categories. FastTree uses the same branch lengths but optimizes the gamma shape parameter and the scale of the lengths. The final tree will have rescaled lengths. Used with -log, this also generates per-site likelihoods for use with CONSEL, see GammaLogToPaup.pl and documentation on the FastTree web site. """, ), _Switch(['-slow', 'slow'], """Use an exhaustive search. Searching for the best join: By default, FastTree combines the 'visible set' of fast neighbor-joining with local hill-climbing as in relaxed neighbor-joining -slow -- exhaustive search (like NJ or BIONJ, but different gap handling) -slow takes half an hour instead of 8 seconds for 1,250 proteins """, ), _Switch(['-fastest', 'fastest'], """Search the visible set (the top hit for each node) only. Searching for the best join: By default, FastTree combines the 'visible set' of fast neighbor-joining with local hill-climbing as in relaxed neighbor-joining -fastest -- search the visible set (the top hit for each node) only Unlike the original fast neighbor-joining, -fastest updates visible(C) after joining A and B if join(AB,C) is better than join(C,visible(C)) -fastest also updates out-distances in a very lazy way, -fastest sets -2nd on as well, use -fastest -no2nd to avoid this """, ), _Switch(['-2nd', 'second'], """Turn 2nd-level top hits heuristic on.
Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search Use -notop (or -slow) to turn this feature off and compare all leaves to each other, and all new joined nodes to each other -2nd or -no2nd to turn 2nd-level top hits heuristic on or off This reduces memory usage and running time but may lead to marginal reductions in tree quality. (By default, -fastest turns on -2nd.) """, ), _Switch(['-no2nd', 'no2nd'], """Turn 2nd-level top hits heuristic off. Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search Use -notop (or -slow) to turn this feature off and compare all leaves to each other, and all new joined nodes to each other -2nd or -no2nd to turn 2nd-level top hits heuristic on or off This reduces memory usage and running time but may lead to marginal reductions in tree quality. (By default, -fastest turns on -2nd.) """, ), _Option(['-seed', 'seed'], """Use -seed to initialize the random number generator. Support value options: By default, FastTree computes local support values by resampling the site likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome, it will compute minimum-evolution bootstrap supports instead In either case, the support values are proportions ranging from 0 to 1. """, checker_function=_is_int, equate=False, ), _Switch(['-top', 'top'], """Top-hit list to speed up search Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search Use -notop (or -slow) to turn this feature off and compare all leaves to each other, and all new joined nodes to each other. """, ), _Switch(['-notop', 'notop'], """Turn off top-hit list to speed up search Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search Use -notop (or -slow) to turn this feature off and compare all leaves to each other, and all new joined nodes to each other. """, ), _Option(['-topm', 'topm'], """Change the top hits calculation method Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search -topm 1.0 -- set the top-hit list size to parameter*sqrt(N) FastTree estimates the top m hits of a leaf from the top 2*m hits of a 'close' neighbor, where close is defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m), and updates the top-hits as joins proceed. """, checker_function=_is_numeric, equate=False, ), _Option(['-close', 'close'], """Modify the close heuristic for the top-hit list Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search -close 0.75 -- modify the close heuristic, lower is more conservative. """, checker_function=_is_numeric, equate=False, ), _Option(['-refresh', 'refresh'], """Parameter for conditions that joined nodes are compared to other nodes Top-hit heuristics: By default, FastTree uses a top-hit list to speed up search -refresh 0.8 -- compare a joined node to all other nodes if its top-hit list is less than 80% of the desired length, or if the age of the top-hit list is log2(m) or greater. 
""", checker_function=_is_numeric, equate=False, ), _Option(['-matrix', 'matrix'], """Specify a matrix for nucleotide or amino acid distances Distances: Default: For protein sequences, log-corrected distances and an amino acid dissimilarity matrix derived from BLOSUM45 or for nucleotide sequences, Jukes-Cantor distances To specify a different matrix, use -matrix FilePrefix or -nomatrix """, filename=True, equate=False, ), _Switch(['-nomatrix', 'nomatrix'], """Specify that no matrix should be used for nucleotide or amino acid distances Distances: Default: For protein sequences, log-corrected distances and an amino acid dissimilarity matrix derived from BLOSUM45 or for nucleotide sequences, Jukes-Cantor distances To specify a different matrix, use -matrix FilePrefix or -nomatrix """, ), _Switch(['-nj', 'nj'], "Join options: regular (unweighted) neighbor-joining (default)", ), _Switch(['-bionj', 'bionj'], """Join options: weighted joins as in BIONJ. FastTree will also weight joins during NNIs. """, ), _Option(['-gtrrates', 'gtrrates'], "-gtrrates ac ag at cg ct gt", equate=False, ), _Option(['-gtrfreq', 'gtrfreq'], "-gtrfreq A C G T", equate=False, ), _Option(['-constraints', 'constraints'], """Specifies an alignment file for use with constrained topology searching Constrained topology search options: -constraints alignmentfile -- an alignment with values of 0, 1, and - Not all sequences need be present. A column of 0s and 1s defines a constrained split. Some constraints may be violated (see 'violating constraints:' in standard error). """, filename=True, equate=False, ), _Option(['-constraintWeight', 'constraintWeight'], """Weight strength of contraints in topology searching. Constrained topology search options: -constraintWeight -- how strongly to weight the constraints. A value of 1 means a penalty of 1 in tree length for violating a constraint Default: 100.0 """, checker_function=_is_numeric, equate=False, ), _Option(['-log', 'log'], """Create log files of data such as intermediate trees and per-site rates -log logfile -- save intermediate trees so you can extract the trees and restart long-running jobs if they crash -log also reports the per-site rates (1 means slowest category). """, filename=True, equate=False, ), _Option(['-makematrix', 'makematrix'], "-makematrix [alignment]", filename=True, equate=False, ), _Switch(['-rawdist', 'rawdist'], """Turn off or adjust log-correction in AA or NT distances. Use -rawdist to turn the log-correction off or to use %different instead of Jukes-Cantor in AA or NT distances Distances: Default: For protein sequences, log-corrected distances and an amino acid dissimilarity matrix derived from BLOSUM45 or for nucleotide sequences, Jukes-Cantor distances To specify a different matrix, use -matrix FilePrefix or -nomatrix """, ), _Option(['-sprlength', 'sprlength'], """Set maximum SPR move length in topology refinement (default 10). Topology refinement: By default, FastTree tries to improve the tree with up to 4*log2(N) rounds of minimum-evolution nearest-neighbor interchanges (NNI), where N is the number of unique sequences, 2 rounds of subtree-prune-regraft (SPR) moves (also min. evo.), and up to 2*log(N) rounds of maximum-likelihood NNIs. Use -nni to set the number of rounds of min. evo. NNIs, and -spr to set the rounds of SPRs. 
""", checker_function=_is_int, equate=False, ), _Switch(['-help', 'help'], "Show the help."), _Switch(['-expert', 'expert'], "Show the expert level help."), _Option(['-out', 'out'], """Enter <output file> The path to a Newick Tree output file needs to be specified. """, filename=True, equate=False, ), _Argument(['input'], """Enter <input file> An input file of sequence alignments in fasta or phylip format is needed. By default FastTree expects protein alignments, use -nt for nucleotides. """, filename=True, is_required=True, ), ] AbstractCommandline.__init__(self, cmd, **kwargs)