def const_seq_table(workflow, output_folder, filtered_dir, mergers_file_path, threads):
    """ Builds the ASV table, removes chimeras, creates read counts at each step,
        and writes a fasta file with all sequences

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            output_folder (string): path to output folder
            filtered_dir (string): path to directory with filtered files
            mergers_file_path (string): path to rds file that contains merged reads
            threads (int): number of threads

        Requires:
            dada2, tools, seqinr r packages

        Returns:
            string: path to rds file that contains ASV data
            string: path to tsv file with read counts at each step
            string: path to fasta file with all sequences
    """

    read_counts_steps_path = files.SixteenS.path("counts_each_step", output_folder)
    seqtab_file_path = os.path.join(output_folder, "seqtab_final.rds")
    seqs_fasta_path = os.path.join(output_folder, "sequences.fasta")
    readcounts_rds = "Read_counts_filt.rds"
    asv_tsv = "all_samples_SV_counts.tsv"

    script_path = utilities.get_package_file("const_seq_table", "Rscript")
    version_script = utilities.get_package_file("dada2_version", "Rscript")
    version_command = """echo 'r' `r -e 'packageVersion("dada2")' | grep -C 1 dada2`"""

    workflow.add_task(
        "[vars[0]] \
          --output_dir=[args[0]] \
          --filtered_dir=[args[1]] \
          --merged_file_path=[depends[0]] \
          --read_counts_steps_path=[targets[0]] \
          --readcounts_rds=[vars[2]] \
          --asv_tsv=[vars[3]] \
          --seqtab_file_path=[targets[1]] \
          --seqs_fasta_path=[targets[2]] \
          --threads=[vars[1]]",
        depends=[mergers_file_path,
                 TrackedExecutable("R", version_command="echo '" + version_script + "' `" + version_script + "`")],
        targets=[read_counts_steps_path, seqtab_file_path, seqs_fasta_path],
        args=[output_folder, filtered_dir],
        vars=[script_path, threads, readcounts_rds, asv_tsv],
        name="construct_sequence_table"
    )

    return seqtab_file_path, read_counts_steps_path, seqs_fasta_path

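# For reference: anadama2 substitutes the bracketed [depends]/[targets]/[args]/[vars]
# placeholders from the corresponding lists at run time, so the task above expands to
# a shell command roughly like the following (paths are hypothetical, and the actual
# counts_each_step file name is whatever files.SixteenS.path resolves to):
#
#   /path/to/const_seq_table.R --output_dir=/out --filtered_dir=filtered_input \
#     --merged_file_path=/out/mergers.rds --read_counts_steps_path=/out/counts_each_step.tsv \
#     --readcounts_rds=Read_counts_filt.rds --asv_tsv=all_samples_SV_counts.tsv \
#     --seqtab_file_path=/out/seqtab_final.rds --seqs_fasta_path=/out/sequences.fasta --threads=4
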
def assign_taxonomy(workflow, output_folder, seqtab_file_path, ref_path, threads):
    """ Assigns taxonomy using the Greengenes, Silva, RDP, or UNITE database
        and creates a closed reference file

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            output_folder (string): path to output folder
            seqtab_file_path (string): path to rds file that contains ASV data
            ref_path (string): reference database name
            threads (int): number of threads

        Requires:
            dada2 r package

        Returns:
            string: path to closed reference file
    """

    otu_closed_ref_path = files.SixteenS.path("otu_table_closed_reference", output_folder)

    # check which reference db to use for the taxonomy assignment
    if ref_path == "unite":
        refdb_path = config.SixteenS().unite
        refdb_species_path = "None"
    elif ref_path == "silva":
        refdb_path = config.SixteenS().silva_dada2
        refdb_species_path = config.SixteenS().silva_species_dada2
    elif ref_path == "rdp":
        refdb_path = config.SixteenS().rdp_dada2
        refdb_species_path = config.SixteenS().rdp_species_dada2
    else:
        refdb_path = config.SixteenS().greengenes_dada2
        refdb_species_path = "None"

    script_path = utilities.get_package_file("assign_taxonomy", "Rscript")

    workflow.add_task(
        "[vars[2]] \
          --output_dir=[args[0]] \
          --refdb_path=[vars[0]] \
          --refdb_species_path=[vars[1]] \
          --seqtab_file_path=[depends[0]] \
          --otu_closed_ref_path=[targets[0]] \
          --threads=[vars[3]]",
        depends=[seqtab_file_path],
        targets=[otu_closed_ref_path],
        args=[output_folder],
        vars=[refdb_path, refdb_species_path, script_path, threads],
        name="assign_taxonomy"
    )

    return otu_closed_ref_path

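# A minimal usage sketch (paths hypothetical): any ref_path value other than
# "unite", "silva", or "rdp" falls through to the Greengenes database.
#
#   otu_table = assign_taxonomy(workflow, "/out", "/out/seqtab_final.rds", "silva", threads=4)
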
def filter_trim(workflow, input_folder, output_folder, maxee, trunc_len_max, pair_id, threads):
    """ Filters the samples by maxee, trims the reads, renders quality control plots
        of the forward and reverse reads for each sample, and creates read count
        tsv and rds files

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            input_folder (string): path to input folder
            output_folder (string): path to output folder
            maxee (string): maxee value to use for filtering
            trunc_len_max (string): max length for truncating reads
            pair_id (string): pair identifier
            threads (int): number of threads

        Requires:
            dada2, gridExtra, tools r packages

        Returns:
            string: path to tsv file with read counts before and after filtering
            string: name of the directory (under the output folder) with the filtered
                and trimmed sample files
    """

    reads_plotF_png = files.SixteenS.path("readF_qc", output_folder)
    reads_plotR_png = files.SixteenS.path("readR_qc", output_folder)
    readcounts_tsv_path = os.path.join(output_folder, "Read_counts_after_filtering.tsv")
    readcounts_rds_path = os.path.join(output_folder, "Read_counts_filt.rds")
    filtered_dir = "filtered_input"

    script_path = utilities.get_package_file("filter_and_trim", "Rscript")

    workflow.add_task(
        "[vars[0]] \
          --input_dir=[args[0]] \
          --output_dir=[args[1]] \
          --filtered_dir=[vars[1]] \
          --maxee=[args[2]] \
          --trunc_len_max=[args[3]] \
          --readcounts_tsv_path=[targets[0]] \
          --readcounts_rds_path=[targets[1]] \
          --reads_plotF=[targets[2]] \
          --reads_plotR=[targets[3]] \
          --pair_id=[args[4]] \
          --threads=[args[5]]",
        depends=[TrackedDirectory(input_folder)],
        targets=[readcounts_tsv_path, readcounts_rds_path, reads_plotF_png, reads_plotR_png],
        args=[input_folder, output_folder, maxee, trunc_len_max, pair_id, threads],
        vars=[script_path, filtered_dir],
        name="filter_and_trim"
    )

    return readcounts_tsv_path, filtered_dir

def learn_error(workflow, output_folder, filtered_dir, readcounts_tsv_path, threads):
    """ Learns error rates for each sample, renders error rate plots for the
        forward and reverse reads

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            output_folder (string): path to output folder
            filtered_dir (string): path to directory with filtered files
            readcounts_tsv_path (string): path to the tsv file with read counts after filtering
            threads (int): number of threads

        Requires:
            dada2, ggplot2 r packages

        Returns:
            string: path to file that contains error rates of forward reads
            string: path to file that contains error rates of reverse reads
    """

    error_ratesF_png = files.SixteenS.path("error_ratesF", output_folder)
    error_ratesR_png = files.SixteenS.path("error_ratesR", output_folder)
    error_ratesF_path = os.path.join(output_folder, "error_ratesFWD.rds")
    error_ratesR_path = os.path.join(output_folder, "error_ratesREV.rds")

    script_path = utilities.get_package_file("learn_error_rates", "Rscript")

    workflow.add_task(
        "[vars[0]] \
          --output_dir=[args[0]] \
          --filtered_dir=[args[1]] \
          --error_ratesF_png=[targets[0]] \
          --error_ratesR_png=[targets[1]] \
          --error_ratesF_path=[targets[2]] \
          --error_ratesR_path=[targets[3]] \
          --threads=[vars[1]]",
        depends=[readcounts_tsv_path],
        targets=[error_ratesF_png, error_ratesR_png, error_ratesF_path, error_ratesR_path],
        args=[output_folder, filtered_dir],
        vars=[script_path, threads],
        name="learn_error_rates"
    )

    return error_ratesF_path, error_ratesR_path

def merge_paired_ends(workflow, output_dir, filtered_dir, error_ratesF_path, error_ratesR_path, threads, minoverlap, maxmismatch):
    """ Dereplicates and merges the paired reads

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            output_dir (string): path to output folder
            filtered_dir (string): path to directory with filtered files
            error_ratesF_path (string): path to rds file that contains error rates of forward reads
            error_ratesR_path (string): path to rds file that contains error rates of reverse reads
            threads (int): number of threads
            minoverlap (int): minimum length of the overlap required to merge a read pair
            maxmismatch (int): maximum number of mismatches allowed in the overlap region

        Requires:
            dada2, tools r packages

        Returns:
            string: path to rds file that contains the merged and dereplicated reads
    """

    mergers_file_path = os.path.join(output_dir, "mergers.rds")
    script_path = utilities.get_package_file("merge_paired_ends", "Rscript")

    workflow.add_task(
        "[vars[0]] \
          --output_dir=[args[0]] \
          --filtered_dir=[args[1]] \
          --error_ratesF_path=[depends[0]] \
          --error_ratesR_path=[depends[1]] \
          --mergers_file_path=[targets[0]] \
          --threads=[vars[1]] \
          --minoverlap=[args[2]] \
          --maxmismatch=[args[3]]",
        depends=[error_ratesF_path, error_ratesR_path],
        targets=[mergers_file_path],
        args=[output_dir, filtered_dir, minoverlap, maxmismatch],
        vars=[script_path, threads],
        name="dereplicate_and_merge"
    )

    return mergers_file_path

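# A minimal sketch of how these task functions chain together into the full DADA2
# pipeline (argument values are hypothetical; in the packaged workflow they come
# from the parsed command-line options):
#
#   filt_counts, filtered_dir = filter_trim(workflow, "/in", "/out", maxee="2",
#                                           trunc_len_max="200", pair_id="_R1_001", threads=4)
#   errF, errR = learn_error(workflow, "/out", filtered_dir, filt_counts, threads=4)
#   mergers = merge_paired_ends(workflow, "/out", filtered_dir, errF, errR,
#                               threads=4, minoverlap=12, maxmismatch=0)
#   seqtab, counts, fasta = const_seq_table(workflow, "/out", filtered_dir, mergers, threads=4)
#   otu_table = assign_taxonomy(workflow, "/out", seqtab, "silva", threads=4)
#   workflow.go()
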
workflow.add_argument("exclude-workflow-info", desc="do not include data processing task info in report", action="store_true")
workflow.add_argument("format", desc="the format for the report", default="pdf", choices=["pdf", "html"])

# get the arguments from the command line
args = workflow.parse_args()

otu_table = files.SixteenS.path("otu_table_closed_reference", args.input, error_if_not_found=True)

# read and label the metadata
metadata = None
metadata_labels = None
if args.input_metadata:
    metadata = utilities.read_metadata(args.input_metadata, otu_table,
        ignore_features=args.metadata_exclude, otu_table=True)
    metadata_labels, metadata = utilities.label_metadata(metadata,
        categorical=args.metadata_categorical, continuous=args.metadata_continuous)

templates = [utilities.get_package_file("16S")]

# add the template for the data processing information
log_file = None
if not args.exclude_workflow_info:
    log_file = files.Workflow.path("log", args.input, error_if_not_found=True)

# identify the method and list the required and optional files for the workflow
# these are expected to be included in the input folder
# for the dada2/its workflow (the filtN folder is only created by the its workflow,
# so it takes precedence over the dada2 check)
if os.path.isfile(files.SixteenS.path("error_ratesF", args.input, error_if_not_found=False)):
    method = "dada2"
if os.path.isdir(files.SixteenS.path("filtN", args.input, error_if_not_found=False)):
    method = "its"

doc_title = method.upper() + " 16S Report"
if args.random_effects:
    additional_stats_tasks, permanova_plots = utilities.run_permanova(
        workflow,
        args.static_covariates,
        maaslin_tasks_info,
        args.input_metadata,
        args.scale,
        args.min_abundance,
        args.min_prevalence,
        args.permutations,
        args.output,
        additional_stats_tasks)
else:
    additional_stats_tasks, beta_diversity_plots, covariate_equation = utilities.run_beta_diversity(
        workflow,
        maaslin_tasks_info,
        args.input_metadata,
        args.min_abundance,
        args.min_prevalence,
        args.max_missing,
        [args.multivariable_fixed_effects, args.fixed_effects],
        args.output,
        additional_stats_tasks,
        args.random_effects,
        metadata_variables,
        args.adonis_method)

templates = [
    utilities.get_package_file("header"),
    utilities.get_package_file("stats")
]

# add the document to the workflow
doc_task = workflow.add_document(
    templates=templates,
    depends=maaslin_tasks + stratified_plots_tasks + [taxonomic_profile] + additional_stats_tasks,
    targets=workflow.name_output_files("stats_report." + args.format),
    vars={
        "title": "Stats Report",
        "project": args.project_name,
        "introduction_text": args.introduction_text,
        "taxonomic_profile": taxonomic_profile,
        "maaslin_tasks_info": maaslin_tasks_info,
wmgx_qc_counts = files.ShotGun.path("kneaddata_read_counts", wmgx_input_folder, error_if_not_found=True)
wmtx_qc_counts = files.ShotGun.path("kneaddata_read_counts", wmtx_input_folder, error_if_not_found=True)
taxonomic_profile = files.ShotGun.path("taxonomic_profile", wmgx_input_folder, error_if_not_found=True)
pathabundance = files.ShotGun.path("pathabundance_relab", wmgx_input_folder, error_if_not_found=True)

# get the templates for the report
templates = [
    utilities.get_package_file("header"),
    utilities.get_package_file("quality_control_paired_dna_rna"),
    utilities.get_package_file("taxonomy"),
    utilities.get_package_file("functional_dna_rna")
]

# add the template for the data processing information
log_file = None
if not args.exclude_workflow_info:
    templates += [utilities.get_package_file("workflow_info")]
    log_file = files.Workflow.path("log", args.input, error_if_not_found=True)

# add the document to the workflow
doc_task = workflow.add_document(
    templates=templates,
    depends=[wmgx_qc_counts, wmtx_qc_counts, taxonomic_profile, pathabundance],
def remove_primers(workflow, fwd_primer, rev_primer, input_folder, output_folder, pair_id, threads):
    """ Identifies the primers and N-filters the samples

        Args:
            workflow (anadama2.workflow): an instance of the workflow class
            input_folder (string): path to input folder
            output_folder (string): path to output folder
            fwd_primer (string): forward primer
            rev_primer (string): reverse primer
            pair_id (string): pair identifier
            threads (int): number of threads

        Requires:
            dada2, Biostrings, ShortRead, tools r packages

        Returns:
            string: path to folder with primer-removed files
    """

    script_path = utilities.get_package_file("identify_primers", "Rscript")

    filtN_folder = os.path.join(output_folder, "filtN")
    primers_folder = os.path.join(output_folder, "primers")
    fwd_primer_file = os.path.join(primers_folder, "fwd_primer_file.txt")
    rev_primer_file = os.path.join(primers_folder, "rev_primer_file.txt")
    cutadapt_folder = os.path.join(output_folder, "cutadapt")

    # run the identify primers task
    workflow.add_task(
        "[vars[0]] \
          --input_dir=[args[3]] \
          --filtn_dir=[vars[1]] \
          --primers_dir=[vars[2]] \
          --threads=[args[4]] \
          --fwd_primer_file=[targets[0]] \
          --rev_primer_file=[targets[1]] \
          --fwd_primer=[args[0]] \
          --rev_primer=[args[1]] \
          --pair_id=[args[2]]",
        targets=[fwd_primer_file, rev_primer_file, TrackedDirectory(filtN_folder)],
        args=[fwd_primer, rev_primer, pair_id, input_folder, threads],
        vars=[script_path, filtN_folder, primers_folder, output_folder],
        name="identify_primers"
    )

    # pair the forward and reverse read files by the pair identifier
    pair_id2 = pair_id.replace("1", "2", 1)
    fwd_files = sorted(fnmatch.filter(os.listdir(input_folder), "*" + pair_id + "*.fastq*"))
    rev_files = sorted(fnmatch.filter(os.listdir(input_folder), "*" + pair_id2 + "*.fastq*"))

    # run cutadapt on each pair of files to remove the primers
    for i in range(len(fwd_files)):
        fwd_file = os.path.join(input_folder, fwd_files[i])
        rev_file = os.path.join(input_folder, rev_files[i])
        workflow.add_task(
            cutadapt_do,
            depends=[fwd_primer_file, rev_primer_file, fwd_file, rev_file, TrackedDirectory(filtN_folder),
                     TrackedExecutable("cutadapt", version_command="echo 'cutadapt' `cutadapt --version`")],
            targets=[TrackedDirectory(cutadapt_folder)],
            name="remove_primers"
        )

    return cutadapt_folder

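# A minimal usage sketch for the ITS entry point (primer sequences and paths are
# illustrative only; the widely used ITS1f/ITS2 primers are shown as an example):
#
#   cutadapt_dir = remove_primers(workflow,
#                                 fwd_primer="CTTGGTCATTTAGAGGAAGTAA",   # ITS1f, for illustration
#                                 rev_primer="GCTGCGTTCTTCATCGATGC",     # ITS2, for illustration
#                                 input_folder="/in", output_folder="/out",
#                                 pair_id="_R1_001", threads=4)
#   # the returned cutadapt folder is then used as the input folder for filter_trim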