def parse_cli_arguments():
    """Create the HMP2 Proteomics workflow and declare its CLI options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='0.1',
        description='A workflow to handle HMP2 Proteomics data.',
        remove_options=['input', 'output'],
    )

    # (option name, help text) pairs, registered in the order listed.
    option_help = (
        ('manifest-file',
         'Manifest file containing files to process in this workflow run.'),
        ('config-file',
         'Configuration file containing parameters required by the '
         'workflow.'),
        ('checksums-file',
         'MD5 checksums for files found in the supplied input directory.'),
        ('data_specific_metadata',
         'A collection of dataset specific metadata that should be '
         'integrated with any analysis output (creating a PCL file).'),
    )
    for option_name, help_text in option_help:
        wf.add_argument(option_name, desc=help_text)

    return wf
def parse_cli_arguments():
    """Create the DCC upload workflow and declare its CLI options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='0.1',
        description=('A workflow to handle uploading metadata and data '
                     'files to the Data Coordination Center (DCC)'),
        remove_options=['input', 'output'],
    )

    # Each entry is (option name, keyword arguments for add_argument);
    # the options are registered in the order listed.
    argument_specs = (
        ('manifest-file', {
            'desc': 'Manifest file containing files to process in this '
                    'workflow run.',
        }),
        ('metadata-file', {
            'desc': 'Accompanying metadata file for the provided data '
                    'files.',
            'default': None,
        }),
        ('baseline-metadata-file', {
            'desc': 'Metadata file containing baseline visit metadata per '
                    'subject.',
        }),
        ('config-file', {
            'desc': 'Configuration file containing parameters required by '
                    'the workflow.',
        }),
    )
    for option_name, option_kwargs in argument_specs:
        wf.add_argument(option_name, **option_kwargs)

    return wf
def parse_cli_arguments():
    """Create the HMP2 host exome workflow and declare its CLI options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    manifest_help = ('Manifest file containing files to process in this '
                     'workflow run.')
    config_help = ('Configuration file containing parameters required by '
                   'the workflow.')
    threads_help = 'Number of threads to use in workflow processing'

    wf = Workflow(
        version='0.1',
        description='A workflow to handle HMP2 host exome data.',
        remove_options=['input', 'output'],
    )
    wf.add_argument('manifest-file', desc=manifest_help)
    wf.add_argument('config-file', desc=config_help)
    wf.add_argument('threads', desc=threads_help, default=1)

    return wf
def parse_cli_arguments():
    """Create the HMP2 16S visualization workflow and declare its options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='1.0',
        description='A workflow to handle visualization of HMP2 16S data.',
    )

    # Each entry is (option name, keyword arguments for add_argument);
    # the options are registered in the order listed.
    argument_specs = (
        ('config-file', {
            'desc': 'Configuration file containing parameters required by '
                    'the workflow.',
        }),
        ('metadata-file', {
            'desc': 'Accompanying metadata file for the provided data '
                    'files.',
            'default': None,
        }),
        ('source', {
            'desc': 'The source of the output files generated. '
                    '[biobakery, CMMR]',
            'default': 'biobakery',
            'choices': ['biobakery', 'CMMR'],
        }),
    )
    for option_name, option_kwargs in argument_specs:
        wf.add_argument(option_name, **option_kwargs)

    return wf
def parse_cli_arguments():
    """Create the HMP2 Metaviromics visualization workflow with its options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    summary = 'A workflow to handle visualization of HMP2 Metaviromics data.'
    config_help = ('Configuration file containing parameters required by '
                   'the workflow.')
    metadata_help = 'Accompanying metadata file for the provided data files.'

    wf = Workflow(version='1.0', description=summary)
    wf.add_argument('config-file', desc=config_help)
    wf.add_argument('metadata-file', desc=metadata_help, default=None)

    return wf
def parse_cli_arguments():
    """Create the 16S amplicon analysis workflow and declare its options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='0.1',
        description='A workflow to handle analysis of 16S amplicon data.',
        remove_options=['input', 'output'],
    )

    # Each entry is (option name, keyword arguments for add_argument);
    # the options are registered in the order listed.
    argument_specs = (
        ('manifest-file', {
            'desc': 'Manifest file containing files to process in this '
                    'workflow run.',
        }),
        ('config-file', {
            'desc': 'Configuration file containing parameters required by '
                    'the workflow.',
        }),
        ('metadata-file', {
            'desc': 'Accompanying metadata file for the provided data '
                    'files.',
            'default': None,
        }),
        ('threads', {
            'desc': 'number of threads/cores for each task to use',
            'default': 1,
        }),
    )
    for option_name, option_kwargs in argument_specs:
        wf.add_argument(option_name, **option_kwargs)

    return wf
def parse_cli_arguments():
    """Create the HMP2 Metabolomic workflow and declare its CLI options.

    The options are only declared here; nothing is parsed, so only the
    workflow object is returned.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    workflow = Workflow(version='0.1',
                        description='A workflow to handle HMP2 '
                                    'Metabolomic data.',
                        remove_options=['input', 'output'])

    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                               'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                               'containing parameters required by the '
                               'workflow.')
    workflow.add_argument('metadata-file',
                          desc='Accompanying metadata file '
                               'for the provided data files.',
                          default=None)
    # Help text previously read "ouptut"; the option name is unchanged.
    workflow.add_argument('aux_metadata',
                          desc='Any additional metadata '
                               'files that can supply metadata for our '
                               'output PCL files.')

    return workflow
def parse_cli_arguments():
    """Create the assembly / gene-calling workflow and declare its options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='0.1',
        description=('A workflow to assemble metagenomic data and run a '
                     'gene caller on the resulting contigs.'),
    )

    wf.add_argument('contaminant-db',
                    desc='KneadData DNA contaminants database.')

    # The remaining options all carry a default:
    # (option name, help text, default value), registered in order.
    defaulted_options = (
        ('file-extension',
         'Extension of input files to assemble and gene call on.',
         '.fastq.gz'),
        ('threads',
         'number of threads/cores for each task to use',
         1),
        ('memory',
         'The amount of memory to use for each assembly job. '
         'Provided in GB',
         '10240'),
    )
    for option_name, help_text, default_value in defaulted_options:
        wf.add_argument(option_name, desc=help_text, default=default_value)

    return wf
if new_sum.lower() == md5sum.lower(): file_handle = open(task.targets[0].name, "w") file_handle.write("Match") file_handle.close() else: error_msg = "ERROR: Sums do not match for file {0}\nComputed Sum: {1}\nExpected Sum: {2}".format( task.depends[1].name, new_sum, md5sum) sys.stderr.write(error_msg) raise Exception(error_msg) # create a workflow and get the arguments workflow = Workflow() workflow.add_argument("input-metadata", desc="the metadata file", required=True) workflow.add_argument("input-extension", desc="the input file extension", required=True) args = workflow.parse_args() # get all of the input files input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True) sample_names = utilities.sample_names(input_files, args.input_extension) # for each raw input file, generate an md5sum file md5sum_outputs = [ os.path.join(args.output, output_file_name) + ".md5sum"
from anadama2 import Workflow
from biobakery_workflows.tasks import dadatwo

workflow = Workflow()

# both primer options are required and share the same help text pattern
for primer_direction, primer_option in (("forward", "fwd-primer"),
                                        ("reverse", "rev-primer")):
    workflow.add_argument(
        primer_option,
        desc=primer_direction + " primer, required for its workflow",
        required=True)

# optional settings as (option name, help text, default)
for option_name, option_help, option_default in (
        ("pair-identifier",
         "the string to identify the first file in a pair",
         "_R1_001"),
        ("threads",
         "number of threads/cores for each task to use",
         1)):
    workflow.add_argument(option_name, desc=option_help,
                          default=option_default)

args = workflow.parse_args()

# positional arguments handed to the primer removal task after the workflow
primer_task_arguments = (
    args.fwd_primer,
    args.rev_primer,
    args.input,
    args.output,
    args.pair_identifier,
    args.threads,
)
dadatwo.remove_primers(workflow, *primer_task_arguments)

workflow.go()
# import the library of biobakery_workflow tasks for shotgun sequences from biobakery_workflows.tasks import shotgun, general # import the utilities functions and config settings from biobakery_workflows from biobakery_workflows import utilities, config # create a workflow instance, providing the version number and description # the version number will appear when running this script with the "--version" option # the description will appear when running this script with the "--help" option workflow = Workflow(version="0.1", description="A workflow for isolate assembly") # add the custom arguments to the workflow workflow_config = config.ShotGun() workflow.add_argument("species-name", desc="the species name", required=True) workflow.add_argument("input-extension", desc="the input file extension", default="fastq.gz", choices=["fastq.gz", "fastq", "fq.gz", "fq"]) workflow.add_argument("threads", desc="number of threads/cores for each task to use", default=1) workflow.add_argument("pair-identifier", desc="the string to identify the first file in a pair", default="_R1_001") workflow.add_argument( "reference-database", desc="the path to the reference database for quality assessment", default="") workflow.add_argument("dbcan-path",
# import the task to convert from biom to tsv from biobakery_workflows.tasks.sixteen_s import convert_from_biom_to_tsv_list # import the files for descriptions and paths from biobakery_workflows import files # create a workflow instance, providing the version number and description workflow = Workflow( version="0.1", remove_options=["input"], description="A workflow for stats on wmgx and 16s data sets") # add the custom arguments to the workflow workflow.add_argument( "input", desc="the folder containing taxonomy and functional data files", required=True) # add the custom arguments to the workflow workflow.add_argument("project-name", desc="the name of the project", required=True) workflow.add_argument("input-metadata", desc="the metadata file (samples as columns or rows)", required=True) workflow.add_argument( "transform", desc= "the transform to apply to the data with MaAsLin2 (default is the MaAsLin2 default transform)", default="") workflow.add_argument(
def parse_cli_arguments():
    '''
    Build the MetaWIBELE characterization workflow and register its options.

    The version number will appear when running this script with the
    "--version" option and the description with the "--help" option.
    The stock "output" option is removed and re-declared with a default of
    the absolute path of config.working_dir. Arguments are only declared
    here, not parsed.

    Returns:
        The Workflow instance with all custom arguments added.
    '''

    tmp_output = os.path.abspath(config.working_dir)
    workflow = Workflow(
        version=VERSION,
        description="A workflow for MetaWIBELE characterization",
        remove_options=["output"])

    # add the custom arguments to the workflow
    workflow.add_argument(
        "threads",
        desc="number of threads/cores for each task to use",
        default=None)
    workflow.add_argument(
        "characterization-config",
        desc="the configuration file of characterization analysis",
        default=None)
    workflow.add_argument(
        "mspminer-config",
        desc="the configuration file used by mspminer; [no] skip "
             "MSPminer-based taxonomic assignment",
        default=None)

    # on/off switches that skip individual annotation steps; they are
    # registered in the order listed
    bypass_flags = (
        ("bypass-clustering",
         "do not cluster proteins into protein families"),
        ("bypass-global-homology",
         "do not annotate protein families based on global homology "
         "information"),
        ("bypass-domain-motif",
         "do not annotate protein families based on domain/motif "
         "information"),
        ("bypass-interproscan",
         "do not annotate protein families based on interproscan"),
        ("bypass-pfam_to_go",
         "do not annotate protein families based on pfam2go"),
        ("bypass-domine",
         "do not annotate protein families based on DOMINE database"),
        ("bypass-sifts",
         "do not annotate protein families based on SIFTS database"),
        ("bypass-expatlas",
         "do not annotate protein families based on Expression Atlas "
         "database"),
        ("bypass-psortb",
         "do not annotate protein families based on psortb"),
        ("bypass-abundance",
         "do not annotate protein families based on abundance information"),
        ("bypass-mspminer",
         "do not annotate protein families based on MSPminer"),
        ("bypass-maaslin",
         "do not annotate protein families based on MaAsLin2"),
    )
    for flag_name, flag_help in bypass_flags:
        workflow.add_argument(flag_name, desc=flag_help, action="store_true")

    # help text previously read "spliting"
    workflow.add_argument(
        "split-number",
        desc="indicates number of splitting files for annotation based on "
             "sequence information",
        default=None)
    workflow.add_argument(
        "bypass-integration",
        desc="do not integrate annotations for protein families",
        action="store_true")
    workflow.add_argument(
        "study",
        desc="specify the study name",
        default=None)
    workflow.add_argument(
        "basename",
        desc="specify the basename for output files",
        default=None)
    workflow.add_argument(
        "input-sequence",
        desc="input the sequence file for gene families (non-redundant "
             "catalogs)",
        required=True)
    workflow.add_argument(
        "input-count",
        desc="input the count file for gene families (non-redundant "
             "catalogs)",
        required=True)
    workflow.add_argument(
        "input-metadata",
        desc="input the metadata file",
        required=True)
    # help text previously read "thet be written"
    workflow.add_argument(
        "output",
        desc="provide an output folder which the workflow database and log "
             "is written. By default, they will be written to the anadama2 "
             "folder of users' working directory",
        default=tmp_output)

    return workflow
from anadama2 import Workflow # import the library of biobakery_workflow tasks for shotgun sequences from biobakery_workflows.tasks import shotgun, general # import the utilities functions and config settings from biobakery_workflows from biobakery_workflows import utilities, config # create a workflow instance, providing the version number and description # the version number will appear when running this script with the "--version" option # the description will appear when running this script with the "--help" option workflow = Workflow(version="0.1", description="A workflow to run strainphlan") # add the custom arguments to the workflow workflow_config = config.ShotGun() workflow.add_argument("input-extension", desc="the input file extension", default="fastq.gz", choices=["fastq.gz","fastq","fq.gz","fq","fasta","fasta.gz"]) workflow.add_argument("threads", desc="number of threads/cores for each task to use", default=1) workflow.add_argument("bypass-taxonomic-profiling", desc="do not run the taxonomic profiling tasks (a tsv profile for each sequence file must be included in the input folder using the same sample name)", action="store_true") workflow.add_argument("strain-profiling-options", desc="additional options when running the strain profiling step", default="") workflow.add_argument("max-strains", desc="the max number of strains to profile", default=20, type=int) # get the arguments from the command line args = workflow.parse_args() # get all input files with the input extension provided on the command line # return an error if no files are found input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True) ### STEP #1: Run taxonomic profiling on all of the filtered files ### if not args.bypass_taxonomic_profiling: merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
from biobakery_workflows import document_templates, utilities

# import the files for descriptions and paths
from biobakery_workflows import files

import os

# create a workflow instance, providing the version number and description
# the stock "input" option is removed here and re-declared below with a
# custom description
workflow = Workflow(
    version="0.1",
    remove_options=["input"],
    description="A workflow for 16S visualization",
)

# custom description for the input argument
input_desc = (
    "A folder containing the final products from the 16s data workflow.\n\n"
    "The input folder must include the following:\n\n"
)
workflow.add_argument("input", desc=input_desc, required=True)

# add the custom arguments to the workflow
workflow.add_argument(
    "project-name",
    desc="the name of the project",
    required=True)
workflow.add_argument(
    "input-metadata",
    desc="the metadata file (samples as columns or rows)")
workflow.add_argument(
    "input-picard",
    desc="the folder of picard quality score files")
workflow.add_argument(
    "input-picard-extension",
    desc="the extensions for the picard quality score files",
    default="quality_by_cycle_metrics")

# repeatable metadata feature options; each gets its own fresh empty list
for feature_option, feature_help in (
        ("metadata-categorical", "the categorical features"),
        ("metadata-continuous", "the continuous features"),
        ("metadata-exclude", "the features to exclude")):
    workflow.add_argument(feature_option, desc=feature_help,
                          action="append", default=[])

workflow.add_argument(
    "exclude-workflow-info",
    desc="do not include data processing task info in report",
    action="store_true")
workflow.add_argument(
    "format",
    desc="the format for the report",
    default="pdf",
    choices=["pdf", "html"])

# get the arguments from the command line
args = workflow.parse_args()
import os

from anadama2 import Workflow

# to run provide the new workflow run input and output folders
# $ python anadama2_add_files_to_database.py --input $NEW_INPUT_FOLDER --output $NEW_OUTPUT_FOLDER

workflow = Workflow()

# add the list of possible file extensions
workflow.add_argument(
    "input-extensions",
    desc="the comma-delimited list of extensions of the input files",
    default="txt,tsv,fastq,fastq.gz,log,sam")

args = workflow.parse_args()


def get_files_to_add(input_folder):
    """Collect the files under input_folder that end with a known extension.

    The accepted extensions come from the comma-delimited
    "--input-extensions" option (read from the module-level ``args``).
    Any folder whose path contains ".anadama" is ignored.

    Args:
        input_folder: The root folder to walk.

    Returns:
        list: Paths (folder joined with file name) of the matching files, in
            the order ``os.walk`` yields them.
    """
    # str.endswith accepts a tuple and matches if any suffix matches
    accepted_suffixes = tuple(set(args.input_extensions.split(",")))

    matching_paths = []
    for current_folder, _subfolders, filenames in os.walk(input_folder):
        if ".anadama" in current_folder:
            continue
        matching_paths.extend(
            os.path.join(current_folder, filename)
            for filename in filenames
            if filename.endswith(accepted_suffixes))

    return matching_paths
#!/usr/bin/env python from anadama2 import Workflow import os workflow = Workflow(version="0.0.2", description="A workflow to run PanPhlAn") workflow.add_argument("threads", default=1, desc="number of threads for panphlan to use") workflow.add_argument("dbfolder", default=None, desc="folder containing database") workflow.add_argument("filesfile", default=None, desc="file with filepaths to run on (relative to input)") workflow.add_argument("ref", default=None, desc="name of reference db") workflow.add_argument( "refs", default=None, desc="file with list of references (relative to dbfolder)") args = workflow.parse_args() in_files = workflow.get_input_files(".fastq.gz") out_files = workflow.name_output_files(name=in_files, tag="panphlan_map", extension="csv.bz2") if args.filesfile: with open(args.filesfile) as f:
["genefamilies_norm_ratio", "ecs_norm_ratio", "paths_norm_ratio"] } # create a custom description for the input argument listing all expected input files input_desc = "A folder containing the final products from the wmgx_mwtx data workflow.\n\nThe input folder should include the following:\n\n" input_desc += "Whole Metagenome Shotgun\n---------------------------------\n" input_desc += files.ShotGun.list_file_path_description( files.ShotGun.wmgx_folder_name, wmgx_input_files) input_desc += "\n\nWhole Metatranscriptome Shotgun\n---------------------------------\n" input_desc += files.ShotGun.list_file_path_description( files.ShotGun.wmtx_folder_name, wmtx_input_files) input_desc += "\n\nRNA/DNA Norm\n---------------------------------\n" input_desc += files.ShotGun.list_file_path_description("", norm_input_files) # add the custom arguments to the workflow workflow.add_argument("input", desc=input_desc, required=True) workflow.add_argument("project-name", desc="the name of the project", required=True) workflow.add_argument( "introduction-text", desc="the text to include in the intro of the report", default= "The data was run through the standard workflow for whole metagenome and metatranscriptome shotgun sequencing." ) workflow.add_argument( "exclude-workflow-info", desc="do not include data processing task info in report", action="store_true") workflow.add_argument("format", desc="the format for the report",
# The fasta reads will be any of the sample reads that map to a marker associated with one of the # species in the "--species-list" file. This file should have one species per line and be formatted # with the metaphlan2 species naming convention. More specifically, the species file should list # one per line with metaphlan2 format (ie "s__Gemella_sanguinis") and for unknown species # include the genus in this file (ie "s__Gemella_unclassified" should be included in the file as "g__Gemella"). # The metaphlan2 pkl database is also required for this script to run and can be provided # with the option "--pkl-database". SAM_READ_NAME_INDEX = 0 SAM_REFERENCE_NAME_INDEX = 2 SAM_SEQ_INDEX = 9 workflow = Workflow() # input folder should have sam alignment files from metaphlan2 run workflow.add_argument("pkl-database", desc="MetaPhlAn2 pkl database", default="metaphlan2_db/mpa_v20_m200.pkl") workflow.add_argument("species-list", desc="the list of species to pull reads for", default="species_list.txt") workflow.add_argument("input-tag-extension", desc="the file name tag and extension", default="_bowtie2.sam") args = workflow.parse_args() def find_reads(task): # read in the species with open(args.species_list) as file_handle: species_list = [taxon.rstrip() for taxon in file_handle.readlines()] db = pickle.load(bz2.BZ2File(args.pkl_database, 'r')) marker_to_species={} for marker,info in db['markers'].items(): if info['clade'] in species_list: marker_to_species[marker]=info['clade']
import datetime # constants ARCHIVE_FOLDER = "/opt/archive_folder/" COUNT_FILE = os.path.join(ARCHIVE_FOLDER, "data_deposition_counts.csv") PUBLIC_COUNT_FILE = os.path.join(ARCHIVE_FOLDER, "data_deposition_counts_public.csv") # create a workflow to check the md5sums for each file from anadama2 import Workflow from biobakery_workflows import utilities # create a workflow and get the arguments workflow = Workflow(remove_options=["input"]) workflow.add_argument("input-upload", desc="the folder of raw uploaded data", required=True) workflow.add_argument("input-processed", desc="the folder of processed data", required=True) workflow.add_argument("key", desc="the key file to use for the transfer", required=True) workflow.add_argument("user", desc="the user id for the transfer", required=True) workflow.add_argument("remote", desc="the remote host name for the transfer", required=True) workflow.add_argument("study", desc="the name of the study", required=True) workflow.add_argument("output-transfer",
#!/usr/bin/env python from anadama2 import Workflow import os workflow = Workflow(version="0.0.1", description="A workflow to run PanPhlAn") workflow.add_argument("dbfolder", default=None, desc="folder containing database") workflow.add_argument("ref", default=None, desc="name of reference db") workflow.add_argument( "refs", default=None, desc="file with list of references (relative to dbfolder)") args = workflow.parse_args() cmd = "panphlan_profile.py -c {0} -i {0}/ --o_dna [target] --add_strains" if args.dbfolder: cmd += " --i_bowtie2_indexes {}".format(args.dbfolder) if args.ref: refs = [args.ref] elif args.refs: r = open(args.refs, "r") refs = [l.strip() for l in r] r.close() for ref in refs:
def parse_cli_arguments():
    """Create the HMP2 WGS workflow and declare its CLI options.

    Besides the general "threads" option, one optional per-tool thread
    override is declared for each of kneaddata, metaphlan2 and humann2.
    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        anadama2.Workflow: The workflow object for this pipeline, with its
            custom arguments registered.
    """
    wf = Workflow(
        version='1.0',
        description='A workflow to handle HMP2 WGS data.',
        remove_options=['input', 'output'],
    )

    wf.add_argument(
        'manifest-file',
        desc='Manifest file containing files to process in this workflow '
             'run.')
    wf.add_argument(
        'config-file',
        desc='Configuration file containing parameters required by the '
             'workflow.')
    wf.add_argument(
        'metadata-file',
        desc='Accompanying metadata file for the provided data files.',
        default=None)
    wf.add_argument(
        'threads',
        desc='number of threads/cores for each task to use',
        default=1)

    # (option suffix, task name used in the help text) for the per-tool
    # thread overrides, registered in the order listed.
    override_help = ('OPTIONAL. A specific number of threads/cores to use '
                     'just for the {0} task.')
    per_tool_overrides = (
        ('kneaddata', 'kneaddata'),
        ('metaphlan', 'metaphlan2'),
        ('humann', 'humann2'),
    )
    for option_suffix, task_name in per_tool_overrides:
        wf.add_argument('threads-' + option_suffix,
                        desc=override_help.format(task_name),
                        default=None)

    return wf
""" from anadama2 import Workflow import os, sys, fnmatch from biobakery_workflows.tasks import sixteen_s, dadatwo, general from biobakery_workflows import utilities, config, files # create a workflow instance, providing the version number and description workflow = Workflow(version="0.1", description="A workflow for 16S sequencing data") # add the custom arguments to the workflow workflow_config = config.SixteenS() workflow.add_argument("method", desc="method to process 16s workflow", default="vsearch", choices=["usearch", "dada2", "vsearch", "its"]) workflow.add_argument("dada-db", desc="reference database for dada2 workflow", default="silva", choices=["gg", "rdp", "silva", "unite"]) workflow.add_argument( "usearch-db", desc= "full paths for the reference databases (fna and taxonomy, comma delimited) for the usearch workflow", default=",".join([ workflow_config.greengenes_fasta, workflow_config.greengenes_taxonomy ])) workflow.add_argument("bypass-functional-profiling", desc="bypass the functional profiling tasks", action="store_true")
import os
from glob import glob
from anadama2 import Workflow
from anadama2.tracked import TrackedExecutable

# Setting the version of the workflow and short description
workflow = Workflow(
    version="0.0.1",  # Update the version as needed
    description="Analysis Template",  # Update the description as needed
)

# Setting additional custom arguments for workflow - run.py
# Each entry is (option name, help text, default value).
for option_name, option_help, option_default in (
        ("lines",
         "Number of lines to trim [default: 10]",
         "10"),
        ("metadata",
         "Metadata for performing analysis [default: input/metadata.tsv]",
         "input/metadata.tsv")):
    workflow.add_argument(name=option_name, desc=option_help,
                          default=option_default)

# Parsing the workflow arguments
args = workflow.parse_args()

# Loading the config setting
args.config = 'etc/config.ini'

# AnADAMA2 example workflow.do: one shell command per call, in this order
for listing_command in (
        "ls /usr/bin/ | sort > [t:output/global_exe.txt]",
        "ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]"):
    workflow.do(listing_command)
# import the utilities functions and config settings from biobakery_workflows from biobakery_workflows import utilities, config # create a workflow instance, providing the version number and description # the version number will appear when running this script with the "--version" option # the description will appear when running this script with the "--help" option workflow = Workflow( version="0.1", description="A workflow for whole metagenome shotgun sequences") # add the custom arguments to the workflow workflow_config = config.ShotGun() workflow.add_argument("input-extension", desc="the input file extension", default="fastq.gz", choices=[ "fastq.gz", "fastq", "fq.gz", "fq", "fasta", "fasta.gz", "fastq.bz2", "fq.bz2" ]) workflow.add_argument("barcode-file", desc="the barcode file", default="") workflow.add_argument("dual-barcode-file", desc="the string to identify the dual barcode file", default="") workflow.add_argument("index-identifier", desc="the string to identify the index files", default="_I1_001") workflow.add_argument( "min-pred-qc-score", desc="the min phred quality score to use for demultiplexing", default=2) workflow.add_argument("threads",
# -*- coding: utf-8 -*-

from anadama2 import Workflow

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(
    version="0.1",
    description="A workflow to run KneadData",
)

# add the custom arguments to the workflow
# Each entry is (option name, help text, default value).
for option_name, option_help, option_default in (
        ("kneaddata-db",
         "the kneaddata database",
         "/work/code/kneaddata/db/"),
        ("input-extension",
         "the input file extension",
         "fastq"),
        ("threads",
         "number of threads for knead_data to use",
         1)):
    workflow.add_argument(option_name, desc=option_help,
                          default=option_default)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
in_files = workflow.get_input_files(extension=args.input_extension)

# get a list of output files, one for each input file, with the kneaddata tag
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")

# command template for the task group; the bracketed names are filled from
# the keyword arguments passed to add_task_group below
kneaddata_command = (
    "kneaddata --input [depends[0]] --output [output_folder] "
    "--reference-db [kneaddata_db] --threads [threads]"
)

# create a task for each set of input and output files to run kneaddata
workflow.add_task_group(
    kneaddata_command,
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads,
)
# species in the "--species-list" file. This file should have one species per line and be formatted # with the metaphlan species naming convention. More specifically, the species file should list # one per line with metaphlan format (ie "s__Gemella_sanguinis") and for unknown species # include the genus in this file (ie "s__Gemella_unclassified" should be included in the file as "g__Gemella"). # The metaphlan pkl database is also required for this script to run and can be provided # with the option "--pkl-database". SAM_READ_NAME_INDEX = 0 SAM_REFERENCE_NAME_INDEX = 2 SAM_SEQ_INDEX = 9 workflow = Workflow() # input folder should have sam alignment files from metaphlan run workflow.add_argument("pkl-database", desc="MetaPhlAn pkl database", default="metaphlan_db/mpa_v30_CHOCOPhlAn_201901.pkl") workflow.add_argument("species-list", desc="the list of species to pull reads for", default="species_list.txt") workflow.add_argument("input-tag-extension", desc="the file name tag and extension", default="_bowtie2.sam") args = workflow.parse_args() def find_reads(task): # read in the species with open(args.species_list) as file_handle: species_list = [taxon.rstrip() for taxon in file_handle.readlines()]
def parse_cli_arguments():
    '''
    Build the MetaWIBELE prioritization workflow and register its options.

    The version number will appear when running this script with the
    "--version" option and the description with the "--help" option.
    The stock "output" option is removed and re-declared with a default of
    the absolute path of config.working_dir. Arguments are only declared
    here, not parsed.

    Returns:
        The Workflow instance with all custom arguments added.
    '''

    tmp_output = os.path.abspath(config.working_dir)
    workflow = Workflow(
        version=VERSION,
        description="A workflow for MetaWIBELE prioritization",
        remove_options=["output"])

    # add the custom arguments to the workflow
    workflow.add_argument(
        "threads",
        desc="number of threads/cores for each task to use",
        default=None)
    workflow.add_argument(
        "prioritization-config",
        desc="the configuration file for prioritization",
        default=None)
    # NOTE: the default here is the string "none", not None
    workflow.add_argument(
        "vignette-config",
        desc="the file with specific functions of interest used as binary "
             "filtering for prioritization",
        default="none")
    workflow.add_argument(
        "bypass-mandatory",
        desc="do not prioritize protein families based on quantitative "
             "criteria (mandatory prioritization)",
        action="store_true")
    workflow.add_argument(
        "bypass-optional",
        desc="do not prioritize protein families based on selecting our "
             "for interested annotations (optional prioritization)",
        action="store_true")
    workflow.add_argument(
        "bypass-finalized",
        desc="do not finalize prioritized protein families",
        action="store_true")
    workflow.add_argument(
        "selected-output",
        desc="the output file name for the prioritized protein families by "
             "binary filtering",
        default=None)
    workflow.add_argument(
        "basename",
        desc="specify the basename for output files",
        default=None)
    workflow.add_argument(
        "input-annotation",
        desc="provide the annotation file for protein families",
        required=True)
    workflow.add_argument(
        "input-attribute",
        desc="provide the annotation attribute file for protein families",
        required=True)
    # The help text is kept as one logical string built from adjacent
    # literals, so no literal spans a physical line break; it previously
    # read "thet be written" and "workding directory".
    workflow.add_argument(
        "output",
        desc="provide an output folder which the workflow database and log "
             "is written. By default, they will be written to the anadama2 "
             "folder of users' working directory",
        default=tmp_output)

    return workflow
def parse_cli_arguments():
    """Create the HMP2 metadata refresh workflow and declare its options.

    The options are only declared here; nothing is parsed.

    Args:
        None

    Requires:
        None

    Returns:
        AnaDAMA2.Workflow: The workflow object for this pipeline.
    """
    workflow = Workflow(version='0.1',
                        description='A workflow to handle '
                                    'refreshing and disseminating HMP2 '
                                    'metadata.',
                        remove_options=['input', 'output'])

    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                               'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                               'containing parameters required by the '
                               'workflow.')
    workflow.add_argument('metadata-file',
                          desc='If an existing metadata '
                               'file exists it can be supplied here. This '
                               'metadata file will be appended to instead '
                               'of a whole new metadata file being '
                               'generated.')
    workflow.add_argument('studytrax-metadata-file',
                          desc='Accompanying '
                               'StudyTrax data all corresponding samples '
                               'in the HMP2 project.')
    workflow.add_argument('broad-sample-tracking-file',
                          desc='Broad Institute '
                               'sample tracking spreadsheet containing '
                               'status of sequence products generated.')
    workflow.add_argument('proteomics-metadata',
                          desc='PNNL-supplied metadata '
                               'spreadsheet.')
    # The option name keeps its historical spelling so existing command
    # lines keep working; only the help text spelling is corrected
    # (previously "auxillary" / "appeneded").
    workflow.add_argument('auxillary-metadata',
                          action='append',
                          default=[],
                          desc='Any auxiliary metadata to be appended '
                               'to the final metadata table.')

    return workflow
def parse_cli_arguments():
    '''
    Build the MetaWIBELE preprocessing workflow and register its options.

    The version number will appear when running this script with the
    "--version" option and the description with the "--help" option.
    Arguments are only declared here, not parsed.

    Returns:
        The Workflow instance with all custom arguments added.
    '''

    summary = (
        "A workflow to preprocess shotgun sequencing reads of metagenomes "
        "with tasks of metagenomic assembly, gene calling, "
        "building gene catalogs and generating gene abundance for each "
        "sample."
    )
    wf = Workflow(version=VERSION, description=summary)

    # Each entry is (option name, keyword arguments for add_argument);
    # the options are registered in the order listed.
    argument_specs = (
        ("threads", {
            "desc": "number of threads/cores for each task to use",
            "default": None,
        }),
        ("extension-paired", {
            "desc": "provide the extension for paired fastq files using "
                    "comma to separate, e.g. .R1.fastq.gz,.R2.fastq.gz | "
                    ".R1.fastq,.R2.fastq",
            "default": None,
        }),
        ("extension", {
            "desc": "provide the extension for all fastq files",
            "choices": [".fastq.gz", ".fastq"],
            "default": ".fastq.gz",
        }),
        ("gene-call-type", {
            "desc": "specify which type of gene calls will be used",
            "choices": ['prokka', 'prodigal', 'both'],
            "default": 'prodigal',
        }),
        ("bypass-assembly", {
            "desc": "do not run assembly",
            "action": "store_true",
        }),
        ("bypass-gene-calling", {
            "desc": "do not call ORFs",
            "action": "store_true",
        }),
        ("bypass-gene-catalog", {
            "desc": "do not build gene catalogs",
            "action": "store_true",
        }),
        ("output-basename", {
            "desc": "provide the basename for output files",
            "default": None,
        }),
    )
    for option_name, option_kwargs in argument_specs:
        wf.add_argument(option_name, **option_kwargs)

    return wf