def handle(self, *args, **options):
    """Register GeneMark-ES 2.3e as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the
    ``[GeneMark-ES 2.3e]`` section of ``../../settings.ini`` (relative to
    this file's directory) supplies the executable path, and the flow
    blueprint, tool, 'Run GeneMark-ES' command, its parameters and the
    tool's file-type relations are saved. That path returns ``None``.
    """
    tool_name = 'GeneMark-ES'
    tool_version = '2.3e'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    section = "{0} {1}".format(tool_name, tool_version)
    tool_settings = settings[section]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://topaz.gatech.edu',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run GeneMark-ES', exec_path=tool_settings['gm_es_bin'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # One entry per command-line parameter, saved in this order.
    param_specs = [
        dict(name='--max_nnn', prefix='--max_nnn ', position=1, default_value='49',
             short_desc='[number] number of unknown letters used to fill gaps'),
        dict(name='--min_contig', prefix='--min_contig ', position=2, default_value='20000',
             short_desc='[number] minimum length of the contig',
             long_desc='All contigs shorter then "min_contig" are excluded from training procedure.'),
        dict(name='--max_contig', prefix='--max_contig ', position=3, default_value='10000000',
             short_desc='[number] maximum length of contig',
             long_desc='Sequences longer then "max_contig" are split into shorter ones to avoid out of memory condition.'),
        dict(name='--BP', prefix='--BP ', position=4,
             short_desc='switches off the branch point submodel and runs original ES algorithm (ON/OFF)'),
        dict(name='--ini_mod', prefix='--ini_mod ', position=5,
             short_desc=''),
        # The input file carries no prefix and is the last positional entry.
        dict(name='<sequence_file_name>', prefix=None, position=6, is_optional=False,
             short_desc='input sequence in FASTA format'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command_bp,
               via_param='<sequence_file_name>')
    tool.can_create(filetype_name='GFF3', via_command=command_bp)
def handle(self, *args, **options):
    """Register geneid 1.4 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the executable
    path is read from ``../../settings.ini`` (relative to this file's
    directory) and the flow blueprint, tool, 'Run geneid' command, its
    parameters and the tool's input file type are saved. That path returns
    ``None``.
    """
    # Name and version together must match the [geneid 1.4] section header
    # in settings.ini, because they are used to build the section key below.
    tool_name = 'geneid'
    tool_version = '1.4'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    section = "{0} {1}".format(tool_name, tool_version)
    tool_settings = settings[section]

    flow_bp = FlowBlueprint(type='s', name=tool_name)
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://www1.imim.es/software/geneid/index.html',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run geneid', exec_path=tool_settings['exec_path'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # Parameter definitions. geneid has many more options than are listed
    # here; this subset covers the invocation
    #   geneid -3 -P <parameter_filename> <Sequence_filename> > some.out.gff3
    param_specs = [
        dict(name='-3', prefix='-3 ', position=1, has_no_value=True,
             is_optional=False, short_desc='Use GFF3 format to print predictions'),
        dict(name='-P', prefix='-P ', position=2,
             is_optional=False, short_desc='Parameter file to use'),
        # The sequence file has to be the last argument positionally.
        dict(name='<sequence_filename>', prefix=None, position=3,
             is_optional=False, short_desc='Input query FASTA file'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    # Input/output definition of the tool (only the input is declared).
    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command_bp,
               via_param='<sequence_filename>')
def handle(self, *args, **options):
    """Register the 'NUCmer genome coverage' 1.0 pipeline tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise a new flow
    blueprint and tool are saved, the flow blueprints of the existing
    NUCmer 3.23 and show-coords 3.23 tools get the new flow added to their
    parents, and fixed parameter values are stored for both commands.
    That path returns ``None``.
    """
    tool_name = 'NUCmer genome coverage'
    tool_version = '1.0'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    # The MUMmer section is looked up but its values are not used below.
    tool_settings = settings["{0} {1}".format('MUMmer', '3.23')]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    nucmer_genome_cov = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='?',
        flow_bp=flow_bp,
    )
    nucmer_genome_cov.save()

    # --- nucmer step -----------------------------------------------------
    nucmer = StandaloneTool.objects.get(name='NUCmer', version='3.23')
    nucmer.flow_bp.parents.add(flow_bp)
    run_nucmer_command = nucmer.flow_bp.get_command('Run NUCmer')

    nucmer_values = (
        ('-b', '190'),
        ('-p', '/tmp/test_nucmer_pipeline'),
    )
    for param_name, param_value in nucmer_values:
        FlowInstanceValue(
            flow=nucmer.flow_bp,
            command=run_nucmer_command,
            command_param=run_nucmer_command.get_param(param_name),
            value=param_value,
        ).save()

    # --- show-coords step, run with the -l -r -T options -------------------
    show_coords = StandaloneTool.objects.get(name='show-coords', version='3.23')
    show_coords.flow_bp.parents.add(flow_bp)
    show_coords_command = show_coords.flow_bp.get_command('Run show-coords')

    # The delta file path is the nucmer '-p' prefix above plus '.delta'.
    FlowInstanceValue(
        flow=show_coords.flow_bp,
        command=show_coords_command,
        command_param=show_coords_command.get_param('<deltafile>'),
        value='/tmp/test_nucmer_pipeline.delta',
    ).save()

    # These three are stored without a value.
    for flag_name in ('-l', '-r', '-T'):
        FlowInstanceValue(
            flow=show_coords.flow_bp,
            command=show_coords_command,
            command_param=show_coords_command.get_param(flag_name),
        ).save()
def handle(self, *args, **options):
    """Register EVM r20120625 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the executable
    path is read from the ``[EVM r20120625]`` section of
    ``../../settings.ini`` (relative to this file's directory) and the
    flow blueprint, tool, 'Run EVM' command, its parameters and the tool's
    required input file types are saved. That path returns ``None``.
    """
    tool_name = 'EVM'
    tool_version = 'r20120625'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    section = "{0} {1}".format(tool_name, tool_version)
    tool_settings = settings[section]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://sourceforge.net/projects/evidencemodeler',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run EVM', exec_path=tool_settings['evm_bin'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # One entry per command-line parameter, saved in this order.
    param_specs = [
        dict(name='--genome', prefix='--genome ', position=1, is_optional=False,
             short_desc='Genome sequence in FASTA format'),
        dict(name='--weights', prefix='--weights ', position=2, is_optional=False,
             short_desc='Weights for evidence types file',
             long_desc='The weights file is used to describe and score the types of each input. More info here: http://evidencemodeler.sourceforge.net/#Preparing_inputs'),
        dict(name='--gene_predictions', prefix='--gene_predictions ', position=3, is_optional=False,
             short_desc='Gene predictions GFF3 file',
             long_desc='All input files which correspond to gene predictions (and not protein or transcript alignments) should be concatenated into this file.'),
        dict(name='--protein_alignments', prefix='--protein_alignments ', position=4, is_optional=True,
             short_desc='Protein alignments in GFF3 format'),
        dict(name='--transcript_alignments', prefix='--transcript_alignments ', position=5, is_optional=True,
             short_desc='Transcript alignments in GFF3 format'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command_bp, via_param='--genome')
    tool.needs(filetype_name='GFF3', via_command=command_bp, via_param='--gene_predictions')
def handle(self, *args, **options):
    """Register Bowtie 1.0.0 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise a described flow
    blueprint and the tool record are saved; no command or parameters are
    defined here. That path returns ``None``.
    """
    tool_name = 'Bowtie'
    tool_version = '1.0.0'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    # The tool's settings section is looked up but its values are not used below.
    section = "{0} {1}".format(tool_name, tool_version)
    tool_settings = settings[section]

    bowtie_description = 'Bowtie is an ultrafast, memory-efficient short read aligner. It aligns short DNA sequences (reads) to the human genome at a rate of over 25 million 35-bp reads per hour. Bowtie indexes the genome with a Burrows-Wheeler index to keep its memory footprint small: typically about 2.2 GB for the human genome (2.9 GB for paired-end).'
    flow_bp = FlowBlueprint(type='s', description=bowtie_description)
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://bowtie-bio.sourceforge.net/index.shtml',
        flow_bp=flow_bp,
    )
    tool.save()
def handle(self, *args, **options):
    """Register Trinity r2013-02-25 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the executable
    path is read from the ``[Trinity r2013-02-25]`` section of
    ``../../settings.ini`` (relative to this file's directory) and the
    flow blueprint, tool, 'Run Trinity' command, its parameters and the
    tool's file-type relations are saved. That path returns ``None``.
    """
    tool_name = 'Trinity'
    tool_version = 'r2013-02-25'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    tool_settings = settings["{0} {1}".format(tool_name, tool_version)]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://trinityrnaseq.sourceforge.net/',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run Trinity', exec_path=tool_settings['exec_path'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # One entry per command-line parameter, saved in this order.
    param_specs = [
        dict(name='--seqType', prefix='--seqType ', position=1,
             is_optional=False, short_desc='Type of reads: (cfa, cfq, fa, or fq)'),
        dict(name='--JM', prefix='--JM ', position=2,
             is_optional=False, short_desc='Number of GB of system memory to use for k-mer counting by jellyfish (eg. 10G). Include the G character.'),
        dict(name='--left', prefix='--left ', position=3,
             short_desc='Left reads'),
        dict(name='--right', prefix='--right ', position=4,
             short_desc='Right reads'),
        dict(name='--single', prefix='--single ', position=5,
             short_desc='Single (unpaired) reads'),
        dict(name='--SS_lib_type', prefix='--SS_lib_type ', position=6,
             short_desc='Strand-specific RNA-Seq read orientation. if paired: RF or FR, if single: F or R. (dUTP method = RF)'),
        dict(name='--output', prefix='--output ', position=7,
             short_desc='Name of directory for output (will be created if doesn\'t already exist.',
             default_value='trinity_out_dir'),
        dict(name='--CPU', prefix='--CPU ', position=8,
             short_desc='Number of CPUs to use', default_value='2'),
        dict(name='--min_contig_length', prefix='--min_contig_length ',
             position=9, short_desc='Minimum assembled contig length to report', default_value='200'),
        dict(name='--jaccard_clip', prefix='--jaccard_clip ', position=10,
             has_no_value=True, short_desc='Set if you have paired reads and expect high gene density with UTR overlap. This is an expensive operation.'),
        dict(name='--no_cleanup', prefix='--no_cleanup ', position=11,
             has_no_value=True, short_desc='Retain all intermediate input files'),

        # Inchworm and K-mer counting-related options.
        dict(name='--min_kmer_cov', prefix='--min_kmer_cov ', position=12,
             short_desc='Min count for K-mers to be assembled by Inchworm', default_value='1'),

        # Should later add the --no_run_quantifygraph option and process the
        # rest via an iterator.

        # Butterfly-related options.
        dict(name='--max_number_of_paths_per_node', prefix='--max_number_of_paths_per_node ',
             position=13, short_desc='Only most supported (N) paths are extended from node A->B, mitigating combinatoric path explorations',
             default_value='10'),
        dict(name='--group_pairs_distance', prefix='--group_pairs_distance ',
             position=14, short_desc='Maximum length expected between fragment pairs. Reads outside this will be treated as single-end',
             default_value='500'),
        dict(name='--path_reinforcement_distance', prefix='--path_reinforcement_distance ',
             position=15, short_desc='Minimum overlap of reads with growing transcript path (default: PE: 75, SE: 25)'),
        dict(name='--no_triplet_lock', prefix='--no_triplet_lock ', position=16,
             has_no_value=True, short_desc='Do not lock triplet-supported nodes'),
        dict(name='--bflyHeapSpaceMax', prefix='--bflyHeapSpaceMax ', position=17,
             default_value='20G', short_desc='Java max heap space setting for butterfly'),
        dict(name='--bflyHeapSpaceInit', prefix='--bflyHeapSpaceInit ', position=18,
             default_value='1G', short_desc='Java initial heap space settings for butterfly'),
        dict(name='--bflyGCThreads', prefix='--bflyGCThreads ', position=19,
             short_desc='Threads for garbage collection'),
        dict(name='--bflyCPU', prefix='--bflyCPU ', position=20,
             short_desc='CPUs to use. Default will match --CPU value'),
        # --bflyCalculateCPU is a bare switch, so it is marked has_no_value
        # like the other switches above; the description previously read
        # "805" where "80%" was meant.
        dict(name='--bflyCalculateCPU', prefix='--bflyCalculateCPU ', position=21,
             has_no_value=True,
             short_desc='Calculate CPUs based on 80% of max_memory divided by bflyHeapSpaceMax'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    # TODO: needs improving. Unfortunately, Trinity currently only supports
    # output definition at the directory level, and the file names under that
    # are created by convention. I've written Brian to see if I can add this.
    tool.creates(filetype_name='FASTA (nucleotide)', via_command=command_bp, via_param='--output')

    # TODO: parameter grouping needs to be applied here.
    tool.can_use(filetype_name='FASTQ (Sanger, paired reads, left)', via_command=command_bp, via_param='--left')
    tool.can_use(filetype_name='FASTQ (Sanger, paired reads, right)', via_command=command_bp, via_param='--right')
    tool.can_use(filetype_name='FASTQ (Sanger, unpaired reads)', via_command=command_bp, via_param='--single')

    # TODO: parameter grouping needs to be applied here.
    tool.can_use(filetype_name='FASTA (paired reads, left)', via_command=command_bp, via_param='--left')
    tool.can_use(filetype_name='FASTA (paired reads, right)', via_command=command_bp, via_param='--right')
    tool.can_use(filetype_name='FASTA (unpaired reads)', via_command=command_bp, via_param='--single')
def handle(self, *args, **options):
    """Register NUCmer 3.23 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the executable
    path is read from the ``[MUMmer 3.23]`` section of
    ``../../settings.ini`` (relative to this file's directory) and the
    flow blueprint, tool, 'Run NUCmer' command, its parameters and the
    tool's file-type relations are saved. That path returns ``None``.
    """
    tool_name = 'NUCmer'
    tool_version = '3.23'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    # NUCmer's settings live in the shared MUMmer section.
    tool_settings = settings["{0} {1}".format('MUMmer', tool_version)]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://mummer.sourceforge.net/manual/#nucmer',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run NUCmer', exec_path=tool_settings['nucmer_bin'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # USAGE: nucmer [options] <Reference> <Query>
    # Entries are saved in list order, which is not the same as position order.
    param_specs = [
        # NOTE(review): STDOUT and STDERR share position 0 - confirm intended.
        dict(name='STDOUT', position=0),
        dict(name='STDERR', position=0),
        dict(name='--mum', prefix='--mum ', has_no_value=True, position=1,
             short_desc='Use anchor matches that are unique in both the reference and query'),
        dict(name='--mumreference', prefix='--mumreference ', has_no_value=True, position=2,
             short_desc='Use anchor matches that are unique in the reference but not necessarily unique in the query'),
        dict(name='-b', prefix='-b ', position=3, default_value='200',
             short_desc='Alignment extension distance',
             long_desc='Distance an alignment extension will attempt to extend poor scoring regions before giving up'),
        dict(name='-c', prefix='-c ', default_value='65', position=4,
             short_desc='Minimum length of a cluster of matches'),
        dict(name='--nodelta', prefix='--nodelta ', has_no_value=True, position=5,
             short_desc='Toggles off creation of delta file'),
        dict(name='-D', prefix='-D ', default_value='5', position=6,
             short_desc='Maximum diagonal difference between two adjacent anchors in a cluster'),
        dict(name='-d', prefix='-d ', default_value='0.12', position=7,
             short_desc='Maximum diagonal difference ratio',
             long_desc='Maximum diagonal difference between two adjacent anchors in a cluster as a differential fraction of the gap length '),
        dict(name='--noextend', prefix='--noextend ', has_no_value=True, position=8,
             short_desc='Toggles off the cluster extension step'),
        dict(name='--forward', prefix='--forward ', has_no_value=True, position=9,
             short_desc='Use only the forward strand of the Query sequences'),
        dict(name='-g', prefix='-g ', default_value='90', position=10,
             short_desc='Maximum gap between two adjacent matches in a cluster'),
        dict(name='-l', prefix='-l ', default_value='20', position=11,
             short_desc='Minimum length of a single match'),
        dict(name='--nooptimize', prefix='--nooptimize ', has_no_value=True, position=12,
             short_desc='Toggle off alignment score optimization',
             long_desc='Toggles off alignment score optimization, i.e. if an alignment extension reaches the end of a sequence, it will backtrack to optimize the alignment score instead of terminating the alignment at the end of the sequence'),
        # Made required just so that a tool can always look up the output
        # file more easily.
        # NOTE(review): position 17 puts -p after both positional inputs,
        # unlike the usage line above - confirm this ordering is intended.
        dict(name='-p', prefix='-p ', default_value='out', position=17, is_optional=False,
             short_desc='Sets the output file prefix, which can include the directory path'),
        dict(name='--reverse', prefix='--reverse ', has_no_value=True, position=13,
             short_desc='Use only the reverse complement of the Query sequences'),
        # NOTE(review): the short description reads like the opposite of a
        # "no" switch - confirm the wording.
        dict(name='--nosimplify', prefix='--nosimplify ', has_no_value=True, position=14,
             short_desc='Removes shadowed clusters',
             long_desc='Simplify alignments by removing shadowed clusters. Turn this option off if aligning a sequence to itself to look for repeats'),
        dict(name='<reference_in>', prefix=None, position=15, is_optional=False,
             short_desc='Input reference FASTA file'),
        dict(name='<query_in>', prefix=None, position=16, is_optional=False,
             short_desc='Input query FASTA file'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    for input_param in ('<reference_in>', '<query_in>'):
        tool.needs(filetype_name='FASTA (nucleotide)', via_command=command_bp, via_param=input_param)

    # NOTE(review): the delta file is declared as produced via STDOUT -
    # confirm this rather than the '-p' prefix is what the framework expects.
    tool.creates(filetype_name='MUMmer delta file', via_command=command_bp, via_param='STDOUT')
def handle(self, *args, **options):
    """Register show-coords 3.23 as a standalone tool.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the executable
    path is read from the ``[MUMmer 3.23]`` section of
    ``../../settings.ini`` (relative to this file's directory) and the
    flow blueprint, tool, 'Run show-coords' command, its parameters and
    the tool's required input file type are saved. That path returns
    ``None``.
    """
    tool_name = 'show-coords'
    tool_version = '3.23'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    # show-coords' settings live in the shared MUMmer section.
    tool_settings = settings["{0} {1}".format('MUMmer', tool_version)]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://mummer.sourceforge.net/manual/#coords',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run show-coords', exec_path=tool_settings['show_coords_bin'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # USAGE: show-coords [options] <deltafile>
    # One entry per command-line parameter, saved in this order.
    param_specs = [
        dict(name='-b', prefix='-b ', has_no_value=True, position=1,
             short_desc='Merges overlapping alignments',
             long_desc='Merges overlapping alignments regardless of match dir or frame and does not display any idenitity information.'),
        dict(name='-B', prefix='-B ', has_no_value=True, position=2,
             short_desc='Switch output to btab format'),
        dict(name='-c', prefix='-c ', has_no_value=True, position=3,
             short_desc='Include percent coverage information in the output'),
        dict(name='-d', prefix='-d ', has_no_value=True, position=4,
             short_desc='Display the alignment direction in the additional FRM columns (default for promer)'),
        dict(name='-H', prefix='-H ', has_no_value=True, position=5,
             short_desc='Do not print the output header'),
        dict(name='-I', prefix='-I ', position=6,
             short_desc='Set minimum percent identity to display'),
        dict(name='-k', prefix='-k ', has_no_value=True, position=7,
             short_desc='Knockout 50/75 alignments',
             long_desc='Knockout (do not display) alignments that overlap another alignment in a different frame by more than 50% of their length, AND have a smaller percent similarity or are less than 75% of the size of the other alignment (promer only)'),
        dict(name='-l', prefix='-l ', has_no_value=True, position=8,
             short_desc='Include the sequence length information in the output'),
        dict(name='-L', prefix='-L ', position=9,
             short_desc='Set minimum alignment length to display'),
        dict(name='-o', prefix='-o ', has_no_value=True, position=10,
             short_desc='Annotate maximal alignments between two sequences',
             long_desc='Annotate maximal alignments between two sequences, i.e. overlaps between reference and query sequences'),
        dict(name='-q', prefix='-q ', has_no_value=True, position=11,
             short_desc='Sort output lines by query IDs and coordinates'),
        dict(name='-r', prefix='-r ', has_no_value=True, position=12,
             short_desc='Sort output lines by reference IDs and coordinates'),
        dict(name='-T', prefix='-T ', has_no_value=True, position=13,
             short_desc='Switch output to tab-delimited format'),
        # This input is a delta file (see tool.needs below); the description
        # previously said 'Input reference FASTA file'.
        dict(name='<deltafile>', prefix=None, position=14, is_optional=False,
             short_desc='Input delta file'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    tool.needs(filetype_name='MUMmer delta file', via_command=command_bp, via_param='<deltafile>')
def handle(self, *args, **options):
    """Register 'Trinity in silico read normalization' r2013-02-25.

    When ``self.already_exists`` reports the tool is already present, a
    notice is printed and ``True`` is returned. Otherwise the script path
    is read from the ``[Trinity r2013-02-25]`` section of
    ``../../settings.ini`` (relative to this file's directory) and the
    flow blueprint, tool, 'Run Trinity read normalization' command, its
    parameters and the tool's file-type relations are saved. That path
    returns ``None``.
    """
    tool_name = 'Trinity in silico read normalization'
    tool_version = 'r2013-02-25'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    settings = configparser.ConfigParser()
    settings.read(settings_path)
    # The normalization script is configured in the main Trinity section.
    tool_settings = settings["{0} {1}".format('Trinity', tool_version)]

    flow_description = 'Large RNA-Seq data sets, such as those exceeding 300M pairs, are best suited for in silico normalization prior to running Trinity, in order to reduce memory requirements and greatly improve upon runtimes. Before running the normalization, be sure that in the case of paired reads, the left read names end with suffix /1 and the right read names end with /2'
    flow_bp = FlowBlueprint(type='s', description=flow_description)
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://trinityrnaseq.sourceforge.net/trinity_insilico_normalization.html',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run Trinity read normalization',
                                  exec_path=tool_settings['normalization_script'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # Long description shared by the two list-file parameters.
    list_file_help = 'If you have read collections in different files you can use list files, where each line in a list file is the full path to an input file. This saves you the time of combining them just so you can pass a single file for each direction.'

    # One entry per command-line parameter, saved in this order.
    param_specs = [
        dict(name='--seqType', prefix='--seqType ', position=1,
             is_optional=False, short_desc='Type of reads: (fa, or fq)'),
        dict(name='--JM', prefix='--JM ', position=2,
             is_optional=False, short_desc='Number of GB of system memory to use for k-mer counting by jellyfish (eg. 10G). Include the G character.'),
        dict(name='--left', prefix='--left ', position=3,
             short_desc='Left reads'),
        dict(name='--right', prefix='--right ', position=4,
             short_desc='Right reads'),
        dict(name='--single', prefix='--single ', position=5,
             short_desc='Single (unpaired) reads'),
        # NOTE(review): the list-file variants reuse positions 3 and 4 of
        # --left/--right - confirm this is intended.
        dict(name='--left_list', prefix='--left_list ', position=3,
             short_desc='Left reads, if using a list file. One file path per line',
             long_desc=list_file_help),
        dict(name='--right_list', prefix='--right_list ', position=4,
             short_desc='Right reads, if using a list file. One file path per line',
             long_desc=list_file_help),
        dict(name='--pairs_together', prefix='--pairs_together ', position=6,
             has_no_value=True, short_desc='Process paired reads by averaging stats between pairs and retaining linking info'),
        dict(name='--SS_lib_type', prefix='--SS_lib_type ', position=7,
             short_desc='Strand-specific RNA-Seq read orientation. if paired: RF or FR, if single: F or R. (dUTP method = RF)'),
        dict(name='--output', prefix='--output ', position=8,
             short_desc="Name of directory for output (will be created if doesn't already exist.",
             default_value='normalized_reads'),
        dict(name='--JELLY_CPU', prefix='--JELLY_CPU ', position=9,
             short_desc='Number of threads for Jellyfish to use', default_value='2'),
        dict(name='--PARALLEL_STATS', prefix='--PARALLEL_STATS ', position=10,
             has_no_value=True, short_desc='Generate read stats in parallel for paired reads (Figure 2X Inchworm memory requirement)'),
        dict(name='--KMER_SIZE', prefix='--KMER_SIZE ', position=11,
             short_desc='K-mer size for de Bruijn graph construction', default_value='25'),
        dict(name='--min_kmer_cov', prefix='--min_kmer_cov ',
             position=12, short_desc='Minimum kmer coverage for catalog construction', default_value='1'),
        dict(name='--max_pct_stdev', prefix='--max_pct_stdev ', position=13,
             short_desc='Maximum pct of mean for stdev of kmer coverage across read', default_value='100'),
    ]
    for spec in param_specs:
        CommandBlueprintParam(command=command_bp, **spec).save()

    # (file type, input parameter) pairs; the same file types are reused
    # for the outputs below.
    fastq_inputs = (
        ('FASTQ (Sanger, paired reads, left)', '--left'),
        ('FASTQ (Sanger, paired reads, right)', '--right'),
        ('FASTQ (Sanger, unpaired reads)', '--single'),
    )

    # TODO: parameter grouping needs to be applied here.
    for filetype, input_param in fastq_inputs:
        tool.can_use(filetype_name=filetype, via_command=command_bp, via_param=input_param)

    # TODO: parameter grouping needs to be applied here.

    # TODO: needs improving. Unfortunately, Trinity currently only supports
    # output definition at the directory level, and the file names under that
    # are created by convention. I've written Brian to see if I can add this.
    for filetype, _ in fastq_inputs:
        tool.can_create(filetype_name=filetype, via_command=command_bp, via_param='--output')
def handle(self, *args, **options):
    """Register the GeneMarkS 4.6b standalone tool and its command-line options.

    When self.already_exists() reports that this tool/version pair is
    present, a notice is printed and nothing else happens.  Otherwise the
    tool's section is read from settings.ini (two directories above this
    module), a FlowBlueprint, StandaloneTool and CommandBlueprint are saved,
    one CommandBlueprintParam is saved per option, and the FASTA input /
    GFF3 output file types are declared on the tool.

    Returns:
        True if the tool was already registered; None after a fresh
        registration.

    Raises:
        KeyError: settings.ini has no "GeneMarkS 4.6b" section, or that
            section has no 'gm_s_bin' entry.
    """
    tool_name = 'GeneMarkS'
    tool_version = '4.6b'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    settings = configparser.ConfigParser()
    settings.read(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini'))
    tool_settings = settings["{0} {1}".format(tool_name, tool_version)]

    flow_bp = FlowBlueprint(type='s')
    flow_bp.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://topaz.gatech.edu',
        flow_bp=flow_bp,
    )
    tool.save()

    command_bp = CommandBlueprint(name='Run GeneMarkS', exec_path=tool_settings['gm_s_bin'])
    command_bp.save()
    command_bp.parents.add(flow_bp)

    # Options in command-line order: (name, extra CommandBlueprintParam fields).
    # Positions are derived from this order below, so two options can never
    # end up sharing a position (previously --strand and --prok were both 16).
    #
    # NOTE(review): '--matrix' is described as the iteration limit and
    # '--viral' as the viral preset; confirm both spellings against the
    # GeneMarkS usage output before relying on them.  '--combine' and '--gm'
    # carry default values although the preset descriptions below use them as
    # bare switches -- confirm whether they should be has_no_value.
    param_specs = [
        ('--name', dict(
            default_value='GeneMark_hmm.mod',
            short_desc='<string> name of output model file generated for GeneMark.hmm')),
        ('--combine', dict(
            default_value='GeneMark_hmm_combined.mod',
            short_desc='combine GeneMarkS generated and Heuristic model parameters into one integrated model')),
        ('--gm', dict(
            default_value='GeneMark.mat',
            short_desc='generate model file for GeneMark')),
        ('--species', dict(
            short_desc='<string> name of a species in a model file')),
        ('--clean', dict(
            has_no_value=True,
            short_desc='delete all temporary files')),
        ('--order', dict(
            default_value='2',
            short_desc='<number> markov chain order. (default: 2; supported in range: >= 0)')),
        ('--gcode', dict(
            default_value='11',
            short_desc='<number> genetic code. default: 11; supported: 11, 4 and 1)')),
        ('--shape', dict(
            default_value='partial',
            short_desc='<string> sequence organization (default: partial; supported: linear, circular and partial)')),
        ('--motif', dict(
            default_value='1',
            long_desc='<number> iterative search for a sequence motif associated with CDS start. (default: 1; supported: 1 <true> and 0 <false>)')),
        ('--width', dict(
            default_value='6',
            short_desc='<number> motif width (default: 6; supported in range: >= 3)')),
        ('--prestart', dict(
            default_value='26',
            long_desc='<number> length of sequence upstream of translation initiation site that presumably includes the motif (default: 26; supported in range: >= 0)')),
        ('--identity', dict(
            default_value='0.99',
            long_desc='<number> identity level assigned for termination of iterations (default: 0.99; supported in range: >=0 and <= 1)')),
        ('--matrix', dict(
            default_value='10',
            short_desc='<number> maximum number of iterations (default: 10; supported in range: >= 1)')),
        # This entry used to repeat the --offover text verbatim.
        ('--fixmotif', dict(
            has_no_value=True,
            long_desc='the motif is located at a fixed position with regard to the start; motif could overlap start codon (if this option is on, it changes the meaning of --prestart option which in this case will define the distance from start codon to motif start)')),
        ('--offover', dict(
            has_no_value=True,
            short_desc='prohibits gene overlap (if not specified: overlaps are allowed)')),
        ('--strand', dict(
            default_value='both',
            short_desc='<string> sequence strand to predict genes in (default: both; supported: direct, reverse and both )')),
        ('--prok', dict(
            has_no_value=True,
            short_desc='same as: --combine --clean --gm',
            long_desc='to run program on prokaryotic sequence or phage with building models for both GeneMark and GeneMark.hmm')),
        ('--euk', dict(
            has_no_value=True,
            short_desc='same as: --offover --gcode 1 --clean --fixmotif --prestart 6 --width 12 --order 4 --gm',
            long_desc='to run program on eukaryotic intron-less sequence (i.e. low eukaryote)')),
        ('--viral', dict(
            has_no_value=True,
            short_desc='same as: --combine --gcode 1 --clean --fixmotif --prestart 6 --width 12 --gm',
            long_desc='to run program on a eukaryotic viral genome')),
        ('--par', dict(
            short_desc='<file name> custom parameters for GeneMarkS',
            long_desc='default is selected based on gcode value: par_<gcode>.default')),
        ('--imod', dict(
            short_desc='<file name> custom initiation model for GeneMarkS',
            long_desc='default: heuristic model derived from GC composition of input sequence')),
        ('--test', dict(
            has_no_value=True,
            short_desc='installation test')),
        ('--verbose', dict(
            has_no_value=True,
            short_desc='prints stderr')),
        ('<sequence_file_name>', dict(
            is_optional=False,
            short_desc='input sequence in FASTA format')),
    ]

    for position, (name, extra) in enumerate(param_specs, start=1):
        # Dash-prefixed options are emitted as "<name> "; the bare positional
        # argument has no prefix at all.
        prefix = name + ' ' if name.startswith('-') else None
        CommandBlueprintParam(
            command=command_bp,
            name=name,
            prefix=prefix,
            position=position,
            **extra
        ).save()

    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command_bp, via_param='<sequence_file_name>')
    tool.can_create(filetype_name='GFF3', via_command=command_bp)
def handle(self, *args, **options):
    """Register the Prodigal 2.60 standalone tool and its command-line options.

    Prints a notice and returns True when self.already_exists() says the
    tool/version pair is already present.  Otherwise reads the
    "Prodigal 2.60" section of settings.ini (two directories above this
    module), saves the flow, tool and command blueprints, saves one
    CommandBlueprintParam per option, and declares the file types the tool
    needs and can create.  Returns None after a fresh registration.
    """
    tool_name = 'Prodigal'
    tool_version = '2.60'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    here = os.path.abspath(os.path.dirname(__file__))
    config = configparser.ConfigParser()
    config.read(os.path.join(here, '../../settings.ini'))
    section = config["{0} {1}".format(tool_name, tool_version)]

    flow = FlowBlueprint(type='s', name=tool_name)
    flow.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='https://code.google.com/p/prodigal/',
        flow_bp=flow,
    )
    tool.save()

    command = CommandBlueprint(name='Run prodigal', exec_path=section['exec_path'])
    command.save()
    command.parents.add(flow)

    def add_param(position, flag, **fields):
        # Every Prodigal option is emitted as "<flag> " ahead of its value.
        CommandBlueprintParam(
            command=command,
            name=flag,
            prefix=flag + ' ',
            position=position,
            **fields
        ).save()

    add_param(1, '-a',
              short_desc='Write protein translations to the selected file')
    add_param(2, '-c', has_no_value=True,
              short_desc='Closed ends. Do not allow genes to run off edges')
    add_param(3, '-d',
              short_desc='Write nucleotide sequences of genes to the selected file')
    ## TODO: limit choices to (gbk, gff, or sco)
    add_param(4, '-f', default_value='gbk',
              short_desc='Select output format (gbk, gff, or sco). Default is gbk')
    add_param(5, '-g', default_value='11',
              short_desc='Specify a translation table to use (default 11)')
    add_param(6, '-i', is_optional=False,
              short_desc='Specify input file (default reads from stdin).')
    add_param(7, '-m', has_no_value=True,
              short_desc='Treat runs of Ns as masked sequence and do not build genes across them')
    add_param(8, '-n', has_no_value=True,
              short_desc='Bypass the Shine-Dalgarno trainer and force the program to scan for motifs')
    add_param(9, '-o', is_optional=False,
              short_desc='Specify output file')
    add_param(10, '-p', default_value='single',
              short_desc='Select procedure (single or meta). Default is single.')
    add_param(11, '-s',
              short_desc='Write all potential genes (with scores) to the selected file')
    add_param(12, '-t',
              short_desc='Write or read the specified training file',
              long_desc='Write a training file (if none exists); otherwise, read and use the specified training file')

    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command, via_param='-i')
    # The output file type is tied to the -f format value passed with -o.
    tool.can_create(filetype_name='GenBank Flat File Format', via_command=command,
                    via_params=['-o', '-f=gbk'])
    tool.can_create(filetype_name='GFF3', via_command=command,
                    via_params=['-o', '-f=gff'])
def handle(self, *args, **options):
    """Register the Bowtie-build 1.0.0 standalone tool and its command-line options.

    Prints a notice and returns True when self.already_exists() says the
    tool/version pair is already present.  Otherwise reads settings.ini
    (two directories above this module), saves the flow, tool and command
    blueprints, saves one CommandBlueprintParam per option, and declares the
    FASTA input and Bowtie index output file types.  Returns None after a
    fresh registration.

    The settings section is looked up as "Bowtie <version>" rather than
    under this tool's own name; the executable comes from its
    'bowtie_build_bin' entry.
    """
    tool_name = 'Bowtie-build'
    tool_version = '1.0.0'

    if self.already_exists(tool_name, tool_version):
        print("INFO: tool {0} {1} already exists. Skipping.".format(tool_name, tool_version))
        return True

    ini_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../settings.ini')
    config = configparser.ConfigParser()
    config.read(ini_path)
    section = config["{0} {1}".format('Bowtie', tool_version)]

    flow = FlowBlueprint(
        type='s',
        description=(
            'Bowtie is an ultrafast, memory-efficient short read aligner. '
            'It aligns short DNA sequences (reads) to the human genome at a rate of over 25 million 35-bp reads per hour. '
            'Bowtie indexes the genome with a Burrows-Wheeler index to keep its memory footprint small: '
            'typically about 2.2 GB for the human genome (2.9 GB for paired-end).'
        ),
    )
    flow.save()

    tool = StandaloneTool(
        name=tool_name,
        version=tool_version,
        primary_site='http://bowtie-bio.sourceforge.net/index.shtml',
        flow_bp=flow,
    )
    tool.save()

    command = CommandBlueprint(
        name='Build an index for bowtie',
        exec_path=section['bowtie_build_bin'],
    )
    command.save()
    command.parents.add(flow)

    # bowtie-build [options]* <reference_in> <ebwt_outfile_base>
    # Rows are (name, prefix, extra fields) in command-line order; the
    # 1-based row index becomes the parameter position.
    rows = [
        ('-C', '-C ', {'has_no_value': True,
                       'short_desc': 'Build a colorspace index'}),
        ('-a', '-a ', {'has_no_value': True,
                       'short_desc': 'Disable automatic -p/--bmax/--dcv memory-fitting'}),
        ('-p', '-p ', {'has_no_value': True,
                       'short_desc': 'Use packed strings internally; slower, uses less mem'}),
        ('-B', '-B ', {'has_no_value': True,
                       'short_desc': 'Build both letter- and colorspace indexes'}),
        ('--bmax', '--bmax ', {'short_desc': 'Max bucket sz for blockwise suffix-array builder'}),
        ('--bmaxdivn', '--bmaxdivn ', {'default_value': '4',
                                       'short_desc': 'Max bucket sz as divisor of ref len'}),
        ('--dcv', '--dcv ', {'default_value': '1024',
                             'short_desc': 'Diff-cover period for blockwise'}),
        ('--nodc', '--nodc ', {'has_no_value': True,
                               'short_desc': 'Disable diff-cover (algorithm becomes quadratic)'}),
        ('-r', '-r ', {'has_no_value': True,
                       'short_desc': 'Do not build .3/.4.ebwt (packed reference) portion'}),
        ('-3', '-3 ', {'has_no_value': True,
                       'short_desc': 'Just build .3/.4.ebwt (packed reference) portion'}),
        ('-o', '-o ', {'default_value': '5',
                       'short_desc': 'SA is sampled every 2^offRate BWT chars'}),
        ('-t', '-t ', {'default_value': '10',
                       'short_desc': '# of chars consumed in initial lookup'}),
        ('--ntoa', '--ntoa ', {'has_no_value': True,
                               'short_desc': 'Convert Ns in reference to As'}),
        ('--seed', '--seed ', {'short_desc': 'Seed for random number generator'}),
        ('<reference_in>', None, {'is_optional': False,
                                  'short_desc': 'Input reference FASTA file'}),
        ('<ebwt_outfile_base>', None, {'is_optional': False,
                                       'short_desc': 'Path to the basename of the ebwt files to be created'}),
    ]

    for index, (name, prefix, fields) in enumerate(rows, start=1):
        param = CommandBlueprintParam(
            command=command,
            name=name,
            prefix=prefix,
            position=index,
            **fields
        )
        param.save()

    tool.needs(filetype_name='FASTA (nucleotide)', via_command=command,
               via_param='<reference_in>')
    tool.creates(filetype_name='Bowtie 1.0 index', via_command=command,
                 via_param='<ebwt_outfile_base>')