def sort_by_size(workflow, method, fasta_file, output_folder, min_size):
    """ Sort reads by size, removing those below the min size

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        output_folder (string): The path of the output folder.
        min_size (int): The minimum size a read must have to be kept.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the fasta file sorted by size
    """

    # get the name of the output file
    output_file = utilities.name_files("all_samples_sorted.fasta", output_folder)

    if method == "vsearch":
        workflow.add_task(
            "vsearch --sortbysize [depends[0]] --output [targets[0]] --minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=min_size,
            name="vsearch_sortbysize")
    else:
        workflow.add_task(
            "usearch -sortbysize [depends[0]] -fastaout [targets[0]] -minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=output_file,
            args=min_size,
            name="usearch_sortbysize")

    return output_file
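
# For reference, AnADAMA2 substitutes the bracketed [depends]/[targets]/[args]
# placeholders in each task command from the matching lists, so the vsearch
# branch above would expand roughly as follows (sketch with illustrative paths):
#
#   vsearch --sortbysize output/all_samples_dereplicated.fasta \
#       --output output/all_samples_sorted.fasta --minsize 2
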
def filter_fastq(workflow, method, fastq_file, output_folder, threads, maxee, trunc_len, fastq_ascii, qmax=45):
    """ Filter the fastq files using the maxee value

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        maxee (int): The maxee value to use for filtering.
        trunc_len (int): The value to use for max length.
        fastq_ascii (int): The ascii offset for fastq quality scores (passed to usearch).
        qmax (int): Max qvalue increased from the default of 43 to allow for Ion Torrent data.

    Requires:
        usearch or vsearch

    Returns:
        string: A path to the filtered fasta file
        string: A path to the full fasta file
    """

    # get the names of the filtered and discarded output files
    fasta_filtered_file = utilities.name_files("all_samples_concatenated_filtered.fasta", output_folder)
    fasta_discarded_file = utilities.name_files("all_samples_concatenated_discarded.fasta", output_folder)

    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]]",
            depends=[fastq_file, TrackedExecutable("vsearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len],
            name="vsearch_fastq_filter")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "usearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]] -fastq_qmax [args[3]] -fastq_ascii [args[4]]",
            depends=[fastq_file, TrackedExecutable("usearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len, qmax, fastq_ascii],
            name="usearch_fastq_filter")

    # create a fasta file of all reads (including the discarded reads)
    fasta_file = utilities.name_files("all_samples_concatenated.fasta", output_folder)
    workflow.add_task(
        "cat [depends[0]] [depends[1]] > [targets[0]]",
        depends=[fasta_filtered_file, fasta_discarded_file],
        targets=fasta_file)

    return fasta_filtered_file, fasta_file
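
# Hedged sketch of how filter_fastq feeds the downstream steps in this module,
# assuming a merged/renamed fastq already exists; paths and the maxee/trunc_len
# values are illustrative only:
#
#   filtered_fasta, all_fasta = filter_fastq(workflow, "vsearch",
#       "output/all_samples_concatenated.fastq", "output", threads=4,
#       maxee=1, trunc_len=200, fastq_ascii=33)
#   derep_fasta = dereplicate(workflow, "vsearch", filtered_fasta, "output", 4)
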
def global_alignment(workflow, method, fasta_file, database_file, id, threads, output_file_uc, output_file_tsv, top_hit_only=None):
    """ Run global alignment with the database provided

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        database_file (string): Path to the database file (fasta or usearch format).
        id (float): The percent identity for alignment.
        threads (int): The number of threads/cores for each task.
        output_file_uc (string): The name for the uc output file.
        output_file_tsv (string): The name for the tsv output file.
        top_hit_only (bool): If set, only get the top hits.

    Requires:
        usearch or vsearch

    Returns:
        list: Paths to the mapping results files
    """

    optional_flags = ""

    if method == "vsearch":
        if top_hit_only:
            optional_flags = " -top_hits_only"
        workflow.add_task_gridable(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -usearch_global [depends[0]] -db [depends[1]] -strand plus -id [args[1]] -uc [targets[0]] -otutabout [targets[1]] -threads [args[0]]" + optional_flags,
            depends=[fasta_file, database_file, TrackedExecutable("vsearch")],
            targets=[output_file_uc, output_file_tsv],
            args=[threads, id],
            name="vsearch_global",
            time=60,  # 60 minutes
            mem=2 * 1024,  # 2 GB
            cores=threads)  # time/mem based on 8 cores
    else:
        if top_hit_only:
            optional_flags = " -top_hit_only"
        workflow.add_task_gridable(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "usearch -usearch_global [depends[0]] -db [depends[1]] -strand 'both' -id [args[1]] -uc [targets[0]] -otutabout [targets[1]] -threads [args[0]]" + optional_flags,
            depends=[fasta_file, database_file, TrackedExecutable("usearch")],
            targets=[output_file_uc, output_file_tsv],
            args=[threads, id],
            name="usearch_global",
            time=60,  # 60 minutes
            mem=2 * 1024,  # 2 GB
            cores=threads)  # time/mem based on 8 cores
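
# Hedged call sketch: the caller supplies the uc/tsv target names (for example
# via utilities.name_files), and because the task is gridable the time/mem/cores
# settings are handed to the grid scheduler when one is configured; the file
# names and identity value below are illustrative only:
#
#   uc_out = utilities.name_files("all_samples_otu_mapping_results.uc", output_folder)
#   tsv_out = utilities.name_files("all_samples_otu_mapping_results.tsv", output_folder)
#   global_alignment(workflow, "vsearch", otu_fasta, reference_db, 0.97,
#                    threads, uc_out, tsv_out)
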
def picrust(workflow, otu_table_biom, output_folder):
    """ Runs picrust normalize, predict, and categorize steps

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        otu_table_biom (string): The path to the biom file (closed reference otu table).
        output_folder (string): The path of the output folder.

    Requires:
        Picrust v1.1: Software to predict metagenome function.

    Returns:
        string: The path to the categorized (by function) biom file.
        string: The path to the predicted metagenomes biom file.
    """

    # normalize the otu table
    normalized_otu_table = utilities.name_files("all_samples_normalize_by_copy_number.biom", output_folder)

    # first remove target file as picrust will not overwrite
    # expects biom file is json (not hdf5) format
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "normalize_by_copy_number.py -i [depends[0]] -o [targets[0]]",
        depends=[otu_table_biom, TrackedExecutable("normalize_by_copy_number.py")],
        targets=normalized_otu_table,
        name="normalize_by_copy_number.py")

    # predict metagenomes
    predict_metagenomes_table = utilities.name_files("all_samples_predict_metagenomes.biom", output_folder)

    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "predict_metagenomes.py -i [depends[0]] -o [targets[0]]",
        depends=[normalized_otu_table, TrackedExecutable("predict_metagenomes.py")],
        targets=predict_metagenomes_table,
        name="predict_metagenomes.py")

    # categorize by function
    categorized_function_table = utilities.name_files("all_samples_categorize_by_function.biom", output_folder)

    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "categorize_by_function.py -i [depends[0]] -o [targets[0]] --level 3 -c KEGG_Pathways",
        depends=[predict_metagenomes_table, TrackedExecutable("categorize_by_function.py")],
        targets=categorized_function_table,
        name="categorize_by_function.py")

    return categorized_function_table, predict_metagenomes_table
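
# Hedged sketch of the picrust step followed by conversion of the biom outputs
# to tsv with convert_from_biom_to_tsv (defined later in this module); the file
# names are illustrative and assume a closed-reference otu table in json biom
# format:
#
#   functional_biom, predicted_biom = picrust(workflow, closed_ref_biom, output_folder)
#   convert_from_biom_to_tsv(workflow, functional_biom,
#                            functional_biom.replace(".biom", ".tsv"))
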
def cluster_otus(workflow, method, fasta_file, reference_fasta, output_folder):
    """ Cluster the otus with usearch or vsearch

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        reference_fasta (string): The path to the reference fasta db.
        output_folder (string): The path of the output folder.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the non-chimera otu fasta file
    """

    # get the name of the output file
    output_fasta = utilities.name_files("all_samples_otus_nonchimeras.fasta", output_folder)

    if method == "vsearch":
        output_txt = utilities.name_files("all_samples_vsearch_otus.txt", output_folder)
        all_otus = utilities.name_files("all_otus.fasta", output_folder)
        workflow.add_task(
            "vsearch --cluster_size [depends[0]] --consout [targets[0]] --id 0.97 --relabel 'OTU' --uc [targets[1]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=[all_otus, output_txt],
            name="vsearch_cluster_otus")
        workflow.add_task(
            "vsearch --uchime_ref [depends[0]] --nonchimeras [targets[0]] --strand plus --db [args[0]]",
            depends=[all_otus, TrackedExecutable("vsearch")],
            targets=[output_fasta],
            args=[reference_fasta],
            name="vsearch_nochim")
    else:
        output_txt = utilities.name_files("all_samples_uparse_otus.txt", output_folder)
        workflow.add_task(
            "usearch -cluster_otus [depends[0]] -otus [targets[0]] -relabel 'OTU' -uparseout [targets[1]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=[output_fasta, output_txt],
            name="usearch_cluster_otus")

    return output_fasta
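
# Hedged usage sketch (illustrative paths): the vsearch branch adds an explicit
# reference-based chimera removal task, while the usearch branch leaves chimera
# handling to -cluster_otus, so both return a single non-chimera otu fasta:
#
#   otu_fasta = cluster_otus(workflow, "vsearch", sorted_fasta,
#                            "databases/chimera_reference.fasta", output_folder)
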
def centroid_alignment(workflow, fasta_file, output_fasta, threads, task_name=None):
    """ Run clustalo for centroid alignment

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        fasta_file (string): The path to the fasta file (otu sequences).
        output_fasta (string): The path of the output file.
        threads (int): The number of threads/cores for each task.
        task_name (string): The custom name of the task.

    Requires:
        clustal omega: multiple sequence alignment for proteins

    Returns:
        string: Path to the clustered otu file
    """

    # remove existing output file if already exists as clustalo will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "
        "clustalo -i [depends[0]] -o [targets[0]] --threads [args[0]]",
        depends=[
            fasta_file,
            TrackedExecutable(
                "clustalo",
                version_command="echo 'clustalo' `clustalo --version`")
        ],
        targets=output_fasta,
        args=threads,
        name=task_name if task_name else "clustalo")
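
# Hedged sketch: clustalo writes the multiple sequence alignment to the
# caller-supplied output path; the file and task names below are illustrative:
#
#   msa_fasta = utilities.name_files("all_samples_clustalo_aligned.fasta", output_folder)
#   centroid_alignment(workflow, otu_fasta, msa_fasta, threads,
#                      task_name="clustalo_align_otus")
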
def convert_from_biom_to_tsv(workflow, biom_file, tsv_file, table_type="OTU table", options=""):
    """ Convert from a biom file to a tsv file

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        biom_file (string): The path to the biom file.
        tsv_file (string): The path to write the new tsv file.
        table_type (string): The type of table to convert.
        options (string): Additional options to provide to the convert function.

    Requires:
        Biom v2: A tool for general use formatting of biological data.
    """

    # first remove the tsv target if it exists as biom convert will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "biom convert -i [depends[0]] -o [targets[0]] --table-type='" + table_type + "' --to-tsv " + options,
        depends=[biom_file, TrackedExecutable("biom")],
        targets=tsv_file,
        name="biom")
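
# Hedged sketch: additional biom convert flags can be passed through options,
# which is appended verbatim to the command; the flag shown is illustrative of
# including taxonomy metadata in the tsv output:
#
#   convert_from_biom_to_tsv(workflow, otu_biom, otu_tsv,
#                            options="--header-key taxonomy")
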
def optional_prioritization(workflow, prioritization_conf, interested_function, protein_family_ann, supervised_priority, output_folder, selected_priority): """ This set of tasks will run prioritization using functional annotations as optional fiters. Args: workflow (anadama2.workflow): An instance of the workflow class. prioritization_conf: Configuration file for quantitative prioritization. protein_family_ann: Finalized annotation file for protein . supervised_priority: Supervised prioritization file. Requires: config file annotation and quantitative prioritization files Returns: string: the name of annotation-based prioritization file. Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # annotation_based_prioritization tasks myselection = prioritization.optional_prioritization (workflow, args.prioritization_conf, protein_family_ann, supervised_priority, output_dir, selected_priority) # run the workflow workflow.go() """ config.logger.info("###### Start optional_prioritization module ######") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config # get the clustering output files priority_dir = output_folder if not os.path.exists(priority_dir): os.system("mkdir -p " + priority_dir) # run annotation-based prioritization mylog = re.sub(".tsv", ".log", selected_priority) workflow.add_task( "metawibele_filter_prioritization -c [depends[0]] -a [depends[1]] -p [depends[2]] -f [args[0]] -o [targets[0]] > [args[1]] 2>&1", depends=[ prioritization_conf, protein_family_ann, supervised_priority, TrackedExecutable("metawibele_filter_prioritization") ], targets=[selected_priority], args=[interested_function, mylog], cores=1, name="filter_prioritization") return selected_priority
def quality_report(workflow, method, fastq_file, output_folder, threads, qmax=45):
    """ Generate a qc report from the fastq file of all samples

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        qmax (int): Max qvalue increased from the default of 43 to allow for Ion Torrent data.

    Requires:
        usearch or vsearch

    Returns:
        string: A path to the qc report file
    """

    # get the name of the qc report file
    qc_file = files.SixteenS.path("eestats2", output_folder)

    if method == 'vsearch':
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -fastq_eestats2 [depends[0]] -output [targets[0]] -threads [args[0]]",
            depends=[fastq_file, TrackedExecutable("vsearch")],
            targets=qc_file,
            args=threads,
            name="vsearch_fastq_eestats2")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "usearch -fastq_eestats2 [depends[0]] -output [targets[0]] -threads [args[0]] -fastq_qmax [args[1]]",
            depends=[fastq_file, TrackedExecutable("usearch")],
            targets=qc_file,
            args=[threads, qmax],
            name="usearch_fastq_eestats2")

    return qc_file
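
# Hedged usage sketch (illustrative inputs): the report path is fixed by
# files.SixteenS.path, so callers only choose the output folder:
#
#   eestats_file = quality_report(workflow, "usearch",
#                                 "output/all_samples_concatenated.fastq",
#                                 "output", threads=4)
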
def dereplicate(workflow, method, fasta_file, output_folder, threads):
    """ Dereplicate reads

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - usearch (default) or vsearch.
        fasta_file (string): The path to the fasta file (filtered).
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the dereplicated fasta file
    """

    # get the name of the output file
    output_file = utilities.name_files("all_samples_dereplicated.fasta", output_folder)

    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch --derep_fulllength [depends[0]] --output [targets[0]] --sizein --sizeout --threads [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=threads,
            name="vsearch_derep_fulllength")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "usearch -derep_fulllength [depends[0]] -fastaout [targets[0]] -sizeout -threads [args[0]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=output_file,
            args=threads,
            name="usearch_derep_fulllength")

    return output_file
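
# Note (hedged): the sizein/sizeout flags keep per-sequence abundance
# annotations in the fasta headers, which sort_by_size and cluster_otus use
# downstream. Illustrative chaining:
#
#   derep_fasta = dereplicate(workflow, "vsearch", filtered_fasta, "output", 4)
#   sorted_fasta = sort_by_size(workflow, "vsearch", derep_fasta, "output", min_size=2)
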
def const_seq_table(workflow, output_folder, filtered_dir, mergers_file_path, threads): """ Builds ASV table, removes chimeras, creates read counts at each step, and fasta file with all sequences Args: workflow (anadama2.workflow): an instance of the workflow class output_folder (string): path to output folder filtered_dir (string): path to directory with filtered files mergers_file_path (string): path to rds file that contains merged reads threads (int): number of threads Requires: dada2, tools, seqinr r packages Returns: string: path to rds file that contains ASV data string: path to read counts at each step tsv file string: path to fasta file with all sequences """ read_counts_steps_path = files.SixteenS.path("counts_each_step", output_folder) seqtab_file_path = os.path.join(output_folder, "seqtab_final.rds") seqs_fasta_path = os.path.join(output_folder, "sequences.fasta") readcounts_rds = "Read_counts_filt.rds" asv_tsv = "all_samples_SV_counts.tsv" script_path = utilities.get_package_file("const_seq_table", "Rscript") version_script = utilities.get_package_file("dada2_version", "Rscript") version_command = """echo 'r' `r -e 'packageVersion("dada2")' | grep -C 1 dada2`""" workflow.add_task( "[vars[0]] \ --output_dir=[args[0]]\ --filtered_dir=[args[1]]\ --merged_file_path=[depends[0]]\ --read_counts_steps_path=[targets[0]]\ --readcounts_rds=[vars[2]]\ --asv_tsv=[vars[3]]\ --seqtab_file_path=[targets[1]]\ --seqs_fasta_path=[targets[2]]\ --threads=[vars[1]]", depends = [mergers_file_path,TrackedExecutable("R", version_command="echo '" + version_script + "' `" + version_script + "`")], targets = [read_counts_steps_path, seqtab_file_path, seqs_fasta_path], args = [output_folder, filtered_dir], vars = [script_path, threads, readcounts_rds, asv_tsv ], name = "construct_sequence_table" ) return seqtab_file_path, read_counts_steps_path, seqs_fasta_path
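
# Hedged usage sketch for the DADA2 route; the merged-reads rds comes from an
# earlier merge step and the folder names are illustrative only:
#
#   seqtab_rds, counts_tsv, seqs_fasta = const_seq_table(
#       workflow, output_folder, os.path.join(output_folder, "filtered_input"),
#       mergers_file_path, threads)
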
def assembly(workflow, input_dir, extension, extension_paired, threads, output_folder, contigs): """ This set of tasks will run assembly on the input files provided. Args: workflow (anadama2.workflow): An instance of the workflow class. input_dir: The direcory path of fastq files. extension: The extension for all reads files, e.g. .fastq.gz extension_paired: The extension for paired reads, e.g. _R1.fastq.gz,_R2.fastq.gz threads (int): The number of threads/cores for clustering to use. output_folder (string): The path of the output folder. contigs: The summarized contig file. Requires: metahit v1.1.3: A program for assembling metagenomic sequencing reads fastq files Returns: string: the name of contigs file. Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # add assembly tasks mycontigs = preprocessing_tasks.assembly (workflow, input_dir, args.sample_file, args.extension_paired, args.extension_orphan, args.threads, assembly_dir, contigs) # run the workflow workflow.go() """ config.logger.info("###### Start assembly module ######") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config # ================================================ # collect sequences # ================================================ pair_identifier = None pair_identifier2 = None if extension_paired: extension_paireds = extension_paired.split(",") pair_identifier = re.sub(extension, "", extension_paireds[0]) pair_identifier2 = re.sub("1", "2", pair_identifier) sample_files = utilities.find_files(input_dir, extension_paireds[0], None) samples = utilities.sample_names(sample_files, extension_paireds[0], None) else: extension_paireds = [extension] sample_files = utilities.find_files(input_dir, extension, None) samples = utilities.sample_names(sample_files, extension, None) split_dir = input_dir assembly_dir = output_folder split_files = [] contigs_list = [] for sample in samples: mypair = "none" myorphan = "none" mypair_tmp = [] for item in extension_paireds: if item == "none": continue myfile = os.path.join(split_dir, sample + item) if os.path.isfile(myfile): mypair_tmp.append(myfile) else: sys.exit("File not exist! 
" + myfile) if len(mypair_tmp) == 1: # split into paired reads files mypair_tmp = utilities.split_paired_reads(mypair_tmp[0], extension, pair_identifier) if len(mypair_tmp) == 1: myorphan = mypair_tmp[0] if len(mypair_tmp) == 2: mypair = ",".join(mypair_tmp) if len(mypair_tmp) == 3: mypair = ",".join(mypair_tmp[0:2]) myorphan = mypair_tmp[2] else: if len(mypair_tmp) == 2: mypair = ",".join(mypair_tmp) if len(mypair_tmp) == 3: tmp1 = [] tmp2 = [] for i in mypair_tmp: if re.search(pair_identifier, i): tmp1.append(i) elif re.search(pair_identifier2, i): tmp1.append(i) else: tmp2.append(i) if len(tmp1) > 0: mypair = ",".join(tmp1) if len(tmp2) > 0: myorphan = ",".join(tmp2) split_files.append((sample, mypair, myorphan)) seq_base = sample megahit_contig_dir = os.path.join(assembly_dir, seq_base) megahit_contig = os.path.join(megahit_contig_dir, '%s.contigs.fa' % seq_base) contigs_list.append(megahit_contig) ## run MEGAHIT os.system("mkdir -p " + assembly_dir) for (sample, mypair, myorphan) in split_files: seq_base = sample megahit_contig_dir = os.path.join(assembly_dir, seq_base) megahit_contig = os.path.join(megahit_contig_dir, '%s.contigs.fa' % seq_base) ## MEGAHIT needs memory in a byte format so let's take care of data #time_equation = "24*60 if file_size('[depends[0]]') < 25 else 6*24*60" # 24 hours or more depending on file size #mem_equation = "32*1024 if file_size('[depends[0]]') < 25 else 3*32*1024" # 32 GB or more depending on file size mylog = os.path.join(assembly_dir, '%s.log' % seq_base) if mypair != "none": tmp = mypair.split(",") if len(tmp) == 2: # paired reads: tmp = mypair.split(",") f_seq = tmp[0] r_seq = tmp[1] if myorphan != "none": workflow.add_task_gridable( "rm -rf " + megahit_contig_dir + " && " + "megahit -1 [depends[0]] -2 [depends[1]] -r [args[2]] -t [args[0]] -o [args[3]] --out-prefix [args[1]] >[args[4]] 2>&1", depends=[f_seq, r_seq, TrackedExecutable("megahit")], targets=[megahit_contig], args=[ threads, seq_base, myorphan, megahit_contig_dir, mylog ], cores=threads, mem=mem_equation, time=time_equation, name=sample + "__megahit") else: workflow.add_task_gridable( "rm -rf " + megahit_contig_dir + " && " + "megahit -1 [depends[0]] -2 [depends[1]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1", depends=[f_seq, r_seq, TrackedExecutable("megahit")], targets=[megahit_contig], args=[threads, seq_base, megahit_contig_dir, mylog], cores=threads, mem=mem_equation, time=time_equation, name=sample + "__megahit") else: workflow.add_task_gridable( "rm -rf " + megahit_contig_dir + " && " + "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1", depends=[mypair, TrackedExecutable("megahit")], targets=[megahit_contig], args=[threads, seq_base, megahit_contig_dir, mylog], cores=threads, mem=mem_equation, time=time_equation, name=sample + "__megahit") else: if myorphan != "none": workflow.add_task_gridable( "rm -rf " + megahit_contig_dir + " && " + "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1", depends=[myorphan, TrackedExecutable("megahit")], targets=[megahit_contig], args=[threads, seq_base, megahit_contig_dir, mylog], cores=threads, mem=mem_equation, time=time_equation, name=sample + "__megahit") for myfile in contigs_list: myname = os.path.basename(myfile) myfile_new = os.path.join(assembly_dir, myname) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfile], targets=[myfile_new], cores=1, name="ln__" + myname) ## combine contigs sequences mylog = contigs + ".log" 
workflow.add_task( "metawibele_format_contig_sequences -p [args[0]] -e contigs.fa -o [targets[0]] > [args[1]] 2>&1", depends=utilities.add_to_list( contigs_list, TrackedExecutable("metawibele_format_contig_sequences")), targets=[contigs], args=[assembly_dir, mylog], cores=1, name="format_contig_table") return contigs_list
def gene_catalog(workflow, complete_gene, complete_protein, input_dir, extension, extension_paired, threads, prefix_gene_catalog, gene_catalog, gene_catalog_nuc, gene_catalog_prot, mapping_dir, gene_catalog_saf, gene_catalog_count): """ This set of tasks will build gene catalogs. Args: workflow (anadama2.workflow): An instance of the workflow class. complete_gene: The fasta file of gene nucleotide sequences for complete ORFs. complete_protein: The fasta file of protein sequences for complete ORFs. mapping_dir: The direcory path of mapping results. prefix_gene_catalog: The prefix of gene catalog file. gene_catalog: The gene catalog file. gene_catalog_nuc: The fastq file of nucleotide sequences for gene catalogs. gene_catalog_prot: The fastq file of protein sequences for gene catalogs. gene_catalog_saf: The SAF gtf file for gene catalogs. gene_catalog_count: The count file for gene catalogs. Requires: bowtie2 (tested with 2.3.2) samtools (tested with 1.5) featureCounts (tested with Version 1.6.2) the nucleotide and amino acid sequences for gene catalogs fastq files for each sample Returns: string: file names of gene catalogs Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # add quality control tasks for the fastq files mygene_catalog, mycounts = preprocessing_tasks.gene_catalogs (workflow, complete_gene, complete_protein, mapping_dir, prefix_gene_catalog, gene_catalog, gene_catalog_nuc, gene_catalog_prot, gene_catalog_saf, gene_catalog_count) # run the workflow workflow.go() """ config.logger.info("###### Start gene_catalog module ######") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config ### run gene-catalog workflow mylog = gene_catalog_nuc + ".log" myclust = gene_catalog_nuc + ".clstr" workflow.add_task( 'cd-hit-est -i [depends[0]] [args[0]] -o [targets[0]] >[args[1]] 2>&1 ', depends=[complete_gene, TrackedExecutable("cd-hit-est")], targets=[gene_catalog_nuc, myclust], args=[config.cd_hit_gene_opts, mylog], cores=threads, name="cd-hit-est") mylog = gene_catalog + ".log" workflow.add_task( 'metawibele_extract_cluster -c [depends[0]] -o [targets[0]] >[args[0]] 2>&1 ', depends=[myclust, TrackedExecutable("metawibele_extract_cluster")], targets=[gene_catalog], args=[mylog], cores=1, name="extract_cluster_CD-hit") mylog = gene_catalog_prot + ".log" workflow.add_task( 'metawibele_extract_non_redundance_seq -r [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1 ', depends=[ gene_catalog_nuc, complete_protein, TrackedExecutable("metawibele_extract_non_redundance_seq") ], targets=[gene_catalog_prot], args=[mylog], cores=1, name="extract_non_redundance_seq") ### get the abundance of gene catalog # run gene-abundance workflow mylog = gene_catalog_saf + ".log" workflow.add_task( 'metawibele_gene_abundance_indexRef -r [depends[0]] -t gene -b [args[0]] -o [targets[0]] >[args[1]] 2>&1 ', depends=[ gene_catalog_nuc, TrackedExecutable("metawibele_gene_abundance_indexRef") ], targets=[gene_catalog_saf], args=[prefix_gene_catalog, mylog], cores=1, name="gene_abundance_indexRef") ## collect sequences if extension_paired: extension_paireds = extension_paired.split(",") sample_files = utilities.find_files(input_dir, extension_paireds[0], None) samples = utilities.sample_names(sample_files, extension_paireds[0], None) else: sample_files = utilities.find_files(input_dir, extension, None) samples = 
utilities.sample_names(sample_files, extension, None) ## bowtie2 will map reads to gene categories flt_seqs = [] for sample in samples: seq_file = "NA" if extension_paired: tmp = extension_paired.split(",") else: if extension != "none": tmp = extension.split(",") for item in tmp: if seq_file == "NA": seq_file = os.path.join(input_dir, sample + '%s' % item) else: seq_file = seq_file + "," + os.path.join( input_dir, sample + '%s' % item) flt_seqs.append((sample, seq_file)) # foreah sample ## Now run bowtie2 to map reads to gene categories mappings = [] mappings_tmp = [] #mem_equation = "2*12*1024 if file_size('[depends[0]]') < 10 else 4*12*1024" #time_equation = "2*60 if file_size('[depends[0]]') < 10 else 2*2*60" for (sample, seq_file) in flt_seqs: seq_base = sample mydir = os.path.join(mapping_dir, sample) os.system("mkdir -p " + mydir) sample_counts = os.path.join(mydir, seq_base + ".sort.bed") stdout_log = os.path.join(mydir, '%s.mapping.stdout.log' % seq_base) mappings_tmp.append(sample_counts) workflow.add_task( 'metawibele_gene_abundance -r [depends[0]] -u [args[0]] -t [args[1]] -s [args[2]] -w [args[3]] ' '> [args[4]] 2>&1 ', depends=[ gene_catalog_nuc, gene_catalog_saf, TrackedExecutable("metawibele_gene_abundance") ], targets=[sample_counts], args=[seq_file, threads, seq_base, mydir, stdout_log], cores=1, name=sample + "__gene_abundance") for myfile in mappings_tmp: myname = os.path.basename(myfile) myfile_new = os.path.join(mapping_dir, myname) mappings.append(myfile_new) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfile], targets=[myfile_new], cores=1, name="ln__" + myname) # collect abundance mylog = gene_catalog_count + ".log" workflow.add_task( 'metawibele_gene_catalog_abundance -p [args[0]] -s sort.bed -c [args[1]] -o [targets[0]] >[args[2]] 2>&1 ', depends=utilities.add_to_list( mappings, TrackedExecutable("metawibele_gene_catalog_abundance")), targets=[gene_catalog_count], args=[mapping_dir, gene_catalog, mylog], cores=1, name="gene_catalog_abundance") return gene_catalog, gene_catalog_count
def demultiplex(workflow, input_files, extension, output_folder, barcode_file, index_files, min_phred, pair_identifier): """Demultiplex the files (single end or paired) Args: workflow (anadama2.workflow): An instance of the workflow class. input_files (list): A list of paths to fastq files for input to ea-utils. extension (string): The extension for all files. output_folder (string): The path of the output folder. barcode_file (string): A file of barcodes. index_files (string): A list of paths to the index files. min_phred (int): The min phred quality score to use in the demultiplex command. pair_identifier (string): The string in the file basename to identify the first pair in the set. Requires: ea-utils fastq-multx: A tool to demultiplex fastq files. Returns: list: A list of the demultiplexed files string: output folder of demultiplexed files """ # error if there is more than one index file if len(index_files) > 1: sys.exit("ERROR: Only one index file expected for demultiplexing step.") # read the barcode file to get the expected output files try: file_handle=open(barcode_file) lines=file_handle.readlines() file_handle.close() except EnvironmentError: sys.exit("ERROR: Unable to read barcode file: " + barcode_file) samples=set() for line in lines: # ignore headers or comment lines if not line.startswith("#"): sample_name=line.rstrip().split("\t")[0] if sample_name: samples.add(sample_name) # get the names of the expected output files demultiplex_fastq_files = utilities.name_files(samples,output_folder,subfolder="demultiplex",extension="fastq") # name the barcode file with the reverse complement barcodes added expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt",output_folder,subfolder="demultiplex",create_folder=True) # create a file that includes the reverse complements of the barcodes workflow.add_task( "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]", depends=barcode_file, targets=expanded_barcode_file) # check for paired input files input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier) # capture the demultiplex stats in output files, one for each set of input files if input_pair1: demultiplex_log = utilities.name_files(input_pair1[0],output_folder,subfolder="demultiplex",extension="log") else: demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log") # get the output folder for all files demultiplex_output_folder = os.path.dirname(demultiplex_log) # get the basenames of the output files, one for each sample demultiplex_output_basenames = utilities.name_files(samples,output_folder,subfolder="demultiplex") # create a tracked executable fastq_multx_tracked = TrackedExecutable("fastq-multx",version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`") if input_pair1 and input_pair2: # this run has paired input files # get the second pair identifier pair_identifier2=pair_identifier.replace("1","2",1) # get the names of the expected output files demultiplex_fastq_files_R1 = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames] demultiplex_fastq_files_R2 = [file+pair_identifier2+".fastq" for file in demultiplex_output_basenames] demultiplex_fastq_files = demultiplex_fastq_files_R1+demultiplex_fastq_files_R2 if index_files: # this run has index files workflow.add_task( "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > 
[targets[0]]", depends=[expanded_barcode_file, index_files[0], input_pair1[0], input_pair2[0], fastq_multx_tracked], args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2], targets=demultiplex_log, name="demultiplex") else: workflow.add_task( "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]", depends=[expanded_barcode_file, input_pair1[0], input_pair2[0], fastq_multx_tracked], args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2], targets=demultiplex_log, name="demultiplex") else: # this run has single end input files # get the names of the expected output files demultiplex_fastq_files = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames] if index_files: # this run has index files workflow.add_task( "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]", depends=[expanded_barcode_file, index_files[0], input_files[0], fastq_multx_tracked], args=[min_phred, demultiplex_output_folder, pair_identifier], targets=demultiplex_log, name="demultiplex") else: workflow.add_task( "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]", depends=[expanded_barcode_file, input_files[0]], args=[min_phred, demultiplex_output_folder, pair_identifier, fastq_multx_tracked], targets=demultiplex_log, name="demultiplex") demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log, demultiplex_fastq_files) return demultiplex_fastq_files, demultiplex_output_folder
def gene_calling(workflow, assembly_dir, assembly_extentsion, input_dir, extension, extension_paired, gene_call_type, prokka_dir, prodigal_dir, threads, gene_file, gene_PC_file, protein_file, protein_sort, gene_info, complete_gene, complete_protein): """ This set of tasks will run gene-calling workflow. Args: workflow (anadama2.workflow): An instance of the workflow class. assembly_dir: The direcory path of assembly results. sample_file: The sample list file. prokka_dir: The direcory path of prokka results. prodigal_dir: The direcory path of prodigal results. gene_file: The fasta file of gene nucleotide sequences. gene_PC_file: The fasta file of protein coding gene nucleotide sequences. protein_file: The fasta file of protein sequences. protein_sort: The sorted fasta file of protein sequences. gene_info: The summaized gene calling file. complete_gene: The fasta file of gene nucleotide sequences for complete ORFs. complete_protein: The fasta file of protein sequences for complete ORFs. Requires: prokka 1.14-dev: rapid prokaryotic genome annotation (recommend to close '-c' parameter in prodigal) prodigal v2.6: gene prediction usearch (tested with usearch v9.0.2132_i86linux64) assembled contig files Returns: string: name of gene files Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # add gene calling tasks mygene, myprotein = preprocessing_tasks.gene_calling (workflow, assembly_dir, args.sample_file, prokka_dir, prodigal_dir, gene_file, gene_PC_file, protein_file, protein_sort, gene_info, complete_gene, complete_protein) # run the workflow workflow.go() """ config.logger.info("###### Start gene_calling module ######") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config # ================================================ # collect sequences # ================================================ if extension_paired: extension_paireds = extension_paired.split(",") sample_files = utilities.find_files(input_dir, extension_paireds[0], None) samples = utilities.sample_names(sample_files, extension_paireds[0], None) else: sample_files = utilities.find_files(input_dir, extension, None) samples = utilities.sample_names(sample_files, extension, None) sequence_files = [] for mysample in samples: myfile = os.path.join(assembly_dir, mysample, mysample + "%s" % assembly_extentsion) sequence_files.append(myfile) # foreach sample filtered_contigs = sequence_files # ================================================ # Gene calling # ================================================ fna_file = [] faa_file = [] gff_files = [] fna_file_tmp = [] faa_file_tmp = [] gff_files_tmp = [] ## Using Prodigal if gene_call_type == "prodigal" or gene_call_type == "both": os.system("mkdir -p " + prodigal_dir) for contig in filtered_contigs: contig_base = os.path.basename(contig).split(os.extsep)[0] annotation_dir = os.path.join(prodigal_dir, contig_base) os.system("mkdir -p " + annotation_dir) gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base) cds_file = os.path.join(annotation_dir, '%s.fna' % contig_base) cds_aa = os.path.join(annotation_dir, '%s.faa' % contig_base) score = os.path.join(annotation_dir, '%s.gene_score.txt' % contig_base) stdout_log = os.path.join(annotation_dir, '%s.stdout.log' % contig_base) faa_file_tmp.append(cds_aa) workflow.add_task_gridable( 'prodigal -m -p meta -i [depends[0]] ' '-f gff -o [targets[0]] -d [targets[1]] -s 
[targets[3]] ' '-a [targets[2]] ' '>[args[0]] 2>&1', depends=[contig, TrackedExecutable("prodigal")], targets=[gff_file, cds_file, cds_aa, score], args=[stdout_log], cores=threads, mem=mem_equation, time=time_equation, name=contig_base + "__prodigal") for myfile in faa_file_tmp: myname = os.path.basename(myfile) myfile_new = os.path.join(prodigal_dir, myname) faa_file.append(myfile_new) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfile], targets=[myfile_new], cores=1, name="ln__" + myname) myfna = re.sub(".faa", ".fna", myfile) myfna_new = re.sub(".faa", ".fna", myfile_new) if gene_call_type == "prodigal": fna_file.append(myfna_new) mygff_new = re.sub(".faa", ".gff", myfile_new) gff_files.append(mygff_new) prokka_dir = prodigal_dir workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfna], targets=[myfna_new], cores=1, name="ln__" + os.path.basename(myfna)) mygff = re.sub(".faa", ".gff", myfile) mygff_new = re.sub(".faa", ".gff", myfile_new) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[mygff], targets=[mygff_new], cores=1, name="ln__" + os.path.basename(mygff)) if gene_call_type == "prokka" or gene_call_type == "both": ## Calling genes with Prokka os.system("mkdir -p " + prokka_dir) for contig in filtered_contigs: contig_base = os.path.basename(contig).split(os.extsep)[0] sample = os.path.basename(contig_base) annotation_dir = os.path.join(prokka_dir, sample) os.system("mkdir -p " + annotation_dir) stdout_log = os.path.join( annotation_dir, '%s.prokka.bacteria.stdout.log' % contig_base) score = os.path.join(annotation_dir, '%s.gene_score.txt' % contig_base) gene_nuc = os.path.join(annotation_dir, '%s.ffn' % contig_base) gene_aa = os.path.join(annotation_dir, '%s.faa' % contig_base) gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base) fna_file_tmp.append(gene_nuc) gff_files_tmp.append(gff_file) workflow.add_task_gridable( 'prokka --prefix [args[0]] --addgenes --addmrna --force --metagenome ' '--cpus [args[2]] ' '--outdir [args[1]] [depends[0]] ' '>[args[3]] 2>&1 ', depends=[contig, TrackedExecutable("prokka")], targets=[gene_nuc, gene_aa, gff_file], args=[sample, annotation_dir, threads, stdout_log], cores=threads, mem=mem_equation, time=time_equation, name=contig_base + "__prokka") for myfile in gff_files_tmp: myname = os.path.basename(myfile) myfile_new = os.path.join(prokka_dir, myname) gff_files.append(myfile_new) for myfile in fna_file_tmp: myname = os.path.basename(myfile) myfile_new = os.path.join(prokka_dir, myname) fna_file.append(myfile_new) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfile], targets=[myfile_new], cores=1, name="ln__" + myname) myfaa = re.sub(".ffn", ".faa", myfile) myfaa_new = re.sub(".ffn", ".faa", myfile_new) if gene_call_type == "prokka": faa_file.append(myfaa_new) prodigal_dir = prokka_dir workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[myfaa], targets=[myfaa_new], cores=1, name="ln__" + os.path.basename(myfaa)) mygff = re.sub(".ffn", ".gff", myfile) mygff_new = re.sub(".ffn", ".gff", myfile_new) workflow.add_task("ln -fs [depends[0]] [targets[0]]", depends=[mygff], targets=[mygff_new], cores=1, name="ln__" + os.path.basename(mygff)) # ================================================ # Summarize sequences # ================================================ #mem_equation = "50000" ### combine gene sequences ### nuc_type = "ffn" if gene_call_type == "prodigal": nuc_type = "fna" mylog = re.sub(".fna", ".log", gene_file) workflow.add_task( 
'metawibele_combine_gene_sequences -p [args[0]] -e [args[1]] -o [targets[0]] > [args[2]] 2>&1 ', depends=utilities.add_to_list( fna_file, TrackedExecutable("metawibele_combine_gene_sequences")) + fna_file_tmp + gff_files + gff_files_tmp, targets=[gene_file], args=[prokka_dir, nuc_type, mylog], cores=1, name="combine_gene_sequences") ### combine protein sequences ### ## collect sequences mylog = re.sub(".faa", ".log", protein_file) workflow.add_task( 'metawibele_format_protein_sequences -p [args[0]] -q [args[1]] -e faa -o [targets[0]] ' '-m [targets[1]] >[args[2]] 2>&1 ', depends=utilities.add_to_list( faa_file, TrackedExecutable("metawibele_format_protein_sequences")) + faa_file_tmp + gff_files + gff_files_tmp, targets=[protein_file, gene_info], args=[prokka_dir, prodigal_dir, mylog], cores=1, name="format_protein_sequences") ## sort by length and filter out short-length sequence mylog = re.sub(".faa", ".log", protein_sort) workflow.add_task( 'usearch -sortbylength [depends[0]] ' '-fastaout [targets[0]] -minseqlength 0 >[args[0]] 2>&1 ', depends=[protein_file, TrackedExecutable("usearch")], targets=[protein_sort], args=[mylog], cores=1, name="usearch__sorting") ## extract nucleotide sequence for protein coding genes mylog = re.sub(".fna", ".log", gene_PC_file) workflow.add_task( 'metawibele_extract_protein_coding_genes -g [depends[0]] -p [depends[1]] -o [targets[0]] > [args[0]] 2>&1 ', depends=[ gene_file, protein_sort, TrackedExecutable("metawibele_extract_protein_coding_genes") ], targets=[gene_PC_file], args=[mylog], cores=1, name="extract_protein_coding_genes") ## extract sequences mylog = re.sub(".fna", ".log", complete_gene) workflow.add_task( 'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1', depends=[ gene_info, gene_PC_file, TrackedExecutable("metawibele_extract_complete_ORF_seq") ], targets=[complete_gene], args=[mylog], cores=1, name='extract_complete_ORF_seq') mylog = re.sub(".faa", ".log", complete_protein) workflow.add_task( 'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1', depends=[ gene_info, protein_sort, TrackedExecutable("metawibele_extract_complete_ORF_seq") ], targets=[complete_protein], args=[mylog], cores=1, name='extract_complete_ORF_seq') return complete_gene, complete_protein
def mandatory_prioritization(workflow, prioritization_conf, protein_family_ann, protein_family_attr, output_folder): """ This set of tasks will run prioritization using quantitative criteria. Args: workflow (anadama2.workflow): An instance of the workflow class. prioritization_conf: Configuration file for quantitative prioritization. protein_family_ann: Finalized annotation file for protein . protein_family_attr: Finalized attribue file for annotations. Requires: config file annotation files Returns: string: the name of prioritized file. Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # add quantification_based_prioritization tasks myrank, mypriority = prioritization.mandatory_prioritization (workflow, args.prioritization_conf, protein_family_ann, protein_family_attr, output_dir) # run the workflow workflow.go() """ config.logger.info("###### Start mandatory_prioritization module ######") # get the clustering output files priority_dir = output_folder unsupervised_rank = os.path.join( priority_dir, config.basename + "_unsupervised_prioritization.rank.tsv") supervised_rank = os.path.join( priority_dir, config.basename + "_supervised_prioritization.rank.tsv") #unsupervised_priority = os.path.join(priority_dir, config.basename + "_unsupervised_prioritization.priority.tsv") #supervised_priority = os.path.join(priority_dir, config.basename + "_supervised_prioritization.priority.tsv") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config if not os.path.exists(priority_dir): os.system("mkdir -p " + priority_dir) # run unsupervised prioritization mylog = re.sub(".tsv", ".log", unsupervised_rank) workflow.add_task( "metawibele_quantify_prioritization -c [depends[0]] -m unsupervised -w fixed -a [depends[1]] -b [depends[2]] -o [args[0]] >[args[1]] 2>&1", depends=[ prioritization_conf, protein_family_ann, protein_family_attr, TrackedExecutable("metawibele_quantify_prioritization") ], targets=[unsupervised_rank], args=[priority_dir, mylog], cores=1, name="quantify_prioritization__unsupervised") # run supervised prioritization if not "".join(config.phenotype) == "none": mylog = re.sub(".tsv", ".log", supervised_rank) workflow.add_task( "metawibele_quantify_prioritization -c [depends[0]] -m supervised -w equal -a [depends[1]] -b [depends[2]] -o [args[0]] >[args[1]] 2>&1", depends=[ prioritization_conf, protein_family_ann, protein_family_attr, TrackedExecutable("metawibele_quantify_prioritization") ], targets=[supervised_rank], args=[priority_dir, mylog], cores=1, name="quantify_prioritization__supervised") return unsupervised_rank, supervised_rank
def demultiplex_dual(workflow, output_folder, input_files, extension, barcode_files, dual_barcode_path, min_phred, pair_identifier): """Demultiplex the files (dual indexed paired) Args: workflow (anadama2.workflow): An instance of the workflow class. input_files (list): A list of paths to fastq(gz) files for input to ea-utils. extension (string): The extension for all files. output_folder (string): The path of the output folder. barcode_files (list): A list of barcode files. dual_index_path (string): A paths to the dual index file. min_phred (int): The min phred quality score to use in the demultiplex command. pair_identifier (string): The string in the file basename to identify the first pair in the set. Requires: ea-utils fastq-multx: A tool to demultiplex fastq files. Returns: list: A list of the demultiplexed files string: output folder of demultiplexed files """ # capture the demultiplex stats in log file, one for each set of input files demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log",create_folder=True) demultiplex_output_folder = os.path.dirname(demultiplex_log) # create a tracked executable fastq_multx_tracked = TrackedExecutable("fastq-multx", version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`") # check for paired input files input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier) # get barcode files barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier) # get the second pair identifier pair_identifier2 = pair_identifier.replace("1", "2", 1) try: file_handle = open(dual_barcode_path) lines = file_handle.readlines() file_handle.close() except EnvironmentError: sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path) run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "") demultiplex_files = set() for line in lines: # ignore headers or comment lines if not line.startswith("#"): sample_name = line.split("\t")[0] if sample_name: nm1 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier + "." + extension nm2 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier2 + "." + extension demultiplex_files.add(nm1) demultiplex_files.add(nm2) # get the names of the expected output files # demultiplex_files = utilities.name_files(samples, demultiplex_output_folder, extension=extension) workflow.add_task( "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]]\ -o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]]\ -q [args[2]] > [targets[0]]", depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0]], args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name, fastq_multx_tracked], targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)], name="demultiplex_dual") demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files) return demultiplex_files, demultiplex_output_folder
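
# Hedged usage sketch for dual-indexed runs (illustrative arguments); the dual
# barcode file maps each sample name to its pair of index sequences:
#
#   demux_files, demux_folder = demultiplex_dual(workflow, output_folder,
#       input_files, "fastq.gz", barcode_files, dual_barcode_path,
#       min_phred=20, pair_identifier="_R1")
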
def finalize_prioritization(workflow, unsupervised_rank, selected_unsup_priority, supervised_rank, selected_priority, output_folder, final_unsupervised_rank, final_selected_unsup_priority, final_supervised_rank, final_selected_priority): """ This set of tasks will format prioritization files Args: workflow (anadama2.workflow): An instance of the workflow class. raw prioritized files finalized prioritized files Requires: raw prioritized files Example: from anadama2 import Workflow from MetaWIBELE.characterize import characterization # create an anadama2 workflow instance workflow=Workflow() # add quality control tasks for the fastq files finalize_prioritization (workflow, unsupervised_rank, supervised_rank, selected_priority, output_folder, final_unsupervised_rank, final_supervised_rank, final_selected_priority) # run the workflow workflow.go() """ config.logger.info("###### Start finalize_prioritization module ######") time_equation = config.time # xxx hours defined in global config mem_equation = config.memory # xxx GB defined in global config priority_dir = output_folder if not os.path.exists(priority_dir): os.system("mkdir -p " + priority_dir) # format prioritization mylog = re.sub(".tsv", ".log", final_unsupervised_rank) workflow.add_task( "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1", depends=[ unsupervised_rank, TrackedExecutable("metawibele_finalize_prioritization") ], targets=[final_unsupervised_rank], args=[mylog], cores=1, name="finalize_prioritization__unsupervised_rank") mylog = re.sub(".tsv", ".log", final_selected_unsup_priority) workflow.add_task( "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1", depends=[ selected_unsup_priority, TrackedExecutable("metawibele_finalize_prioritization") ], targets=[final_selected_unsup_priority], args=[mylog], cores=1, name="finalize_prioritization__selected_unsupervised_priority") if not "".join(config.phenotype) == "none": mylog = re.sub(".tsv", ".log", final_supervised_rank) workflow.add_task( "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1", depends=[ supervised_rank, TrackedExecutable("metawibele_finalize_prioritization") ], targets=[final_supervised_rank], args=[mylog], cores=1, name="finalize_prioritization__supervised_rank") mylog = re.sub(".tsv", ".log", final_selected_priority) workflow.add_task( "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1", depends=[ selected_priority, TrackedExecutable("metawibele_finalize_prioritization") ], targets=[final_selected_priority], args=[mylog], cores=1, name="finalize_prioritization__selected_supervised_priority")
def merge_pairs_and_rename(workflow, method, input_files, extension, output_folder, pair_identifier, threads, fastq_ascii): """ Merge the files if pairs and rename sequence ids to match sample id Args: workflow (anadama2.workflow): An instance of the workflow class. method (string): tools for sequence analysis, usearch default or vsearch input_files (list): A list of paths to fastq files. extension (string): The extension for all files. output_folder (string): The path of the output folder. pair_identifier (string): The string in the file basename to identify the first pair in the set. threads (int): The number of threads for each task. Requires: usearch or vsearch Returns: list: A list of the renamed files. """ pair1, pair2 = utilities.paired_files(input_files, extension, pair_identifier) if pair1 and pair2: # paired input files were found # if the files are gzipped, first decompress as fastq_mergepairs will take in fastq.gz but the output will not be correctly formatted if pair1[0].endswith(".gz"): # get the names of the decompressed output files decompressed_pair1 = utilities.name_files( [os.path.basename(file).replace(".gz", "") for file in pair1], output_folder, subfolder="merged_renamed") # get the names of the decompressed output files decompressed_pair2 = utilities.name_files( [os.path.basename(file).replace(".gz", "") for file in pair2], output_folder, subfolder="merged_renamed") # add tasks to decompress the files workflow.add_task_group("gunzip -c [depends[0]] > [targets[0]]", depends=pair1 + pair2, targets=decompressed_pair1 + decompressed_pair2) # the pair files to be used for the remaining tasks are those that are decompressed pair1 = decompressed_pair1 pair2 = decompressed_pair2 # get the sample names from the input file names sample_names = [ os.path.basename(file).replace(pair_identifier + ".fastq", "") for file in pair1 ] # get the names of the output files stitched_files = utilities.name_files(sample_names, output_folder, subfolder="merged_renamed", tag="stitched", extension="fastq", create_folder=True) unjoined_files = utilities.name_files(sample_names, output_folder, subfolder="merged_renamed", tag="unjoined", extension="fastq") # run usearch to merge pairs, if input files are non-empty for read1, read2, stitched_output, unjoined_output in zip( pair1, pair2, stitched_files, unjoined_files): if method == 'vsearch': workflow.add_task( utilities.partial_function(merge_pairs, method="vsearch", threads=threads), depends=[read1, read2, TrackedExecutable("vsearch")], targets=[stitched_output, unjoined_output], name="vsearch_fastq_mergepairs") else: workflow.add_task( utilities.partial_function(merge_pairs, method="userach", threads=threads, fastq_ascii=fastq_ascii), depends=[read1, read2, TrackedExecutable("usearch")], targets=[stitched_output, unjoined_output], name="usearch_fastq_mergepairs") # merge the stitched and unjoined from the prior step renamed_files = utilities.name_files(sample_names, output_folder, subfolder="merged_renamed", tag="renamed", extension="fastq") workflow.add_task_group( "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]", depends=zip(stitched_files, unjoined_files), targets=renamed_files) else: # these files are not pairs and do not need to be merged # rename the files renamed_files = utilities.name_files(input_files, output_folder, subfolder="merged_renamed", tag="renamed", extension="fastq", create_folder=True) workflow.add_task_group( "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]", depends=input_files, 
targets=renamed_files) return renamed_files
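
# Hedged usage sketch (illustrative arguments): for paired gzipped input the
# task graph per sample is gunzip -> merge_pairs -> merge_and_rename_fastq.py:
#
#   renamed = merge_pairs_and_rename(workflow, "vsearch", input_files,
#       "fastq.gz", output_folder, pair_identifier="_R1", threads=4,
#       fastq_ascii=33)
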
utilities.name_files([ name + ".trimmed.1.fastq", name + ".trimmed.2.fastq", name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq", name + ".trimmed.single.12.fastq" ], args.output, subfolder="kneaddata", create_folder=True) for name in sample_names ] paired = True for target_set, input_R1, input_R2, name in zip(qc_targets, input_pair1, input_pair2, sample_names): workflow.add_task( "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]", depends=[input_R1, input_R2, TrackedExecutable("kneaddata")], targets=[target_set[0], target_set[1], target_set[4]], args=[ os.path.dirname(target_set[0]), args.threads, name, target_set[2], target_set[3] ]) else: qc_targets = utilities.name_files(sample_names, args.output, subfolder="kneaddata", create_folder=True, extension="trimmed.fastq") for target_file, input_file, name in zip(qc_targets, input_files, sample_names): workflow.add_task( "kneaddata --run-fastqc-start --input [depends[0]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]]",
# Parsing the workflow arguments args = workflow.parse_args() #Loading the config setting args.config = 'etc/config.ini' # AnADAMA2 example workflow.do workflow.do("ls /usr/bin/ | sort > [t:output/global_exe.txt]") #Command workflow.do("ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]") #Command # Task0 sample python analysis module - src/trim.py workflow.add_task( "src/trim.py --lines [args[0]] --output [targets[0]] --input " + args.input, #Command depends=[TrackedExecutable("src/trim.py") ], #Tracking executable dependencies targets=args.output, #Output target directory args=[args.lines]) #Additional arguments # Task1 sample python visualization module - src/plot.py workflow.add_task( "src/plot.py --output [targets[0]] --input " + args.input, #Command depends=[TrackedExecutable("src/plot.py") ], #Tracking executable dependencies targets=args.output) #Output target directory # Task2 sample R module - src/analysis_example.r workflow.add_task( "src/analysis.R -o [targets[0]] -d " + args.metadata, #Command depends=[TrackedExecutable("src/analysis.R")
def remove_primers(workflow,fwd_primer,rev_primer,input_folder,output_folder,pair_id,threads): """ Identifies primers and N filters samples Args: workflow (anadama2.workflow): an instance of the workflow class input_folder (string): path to input folder output_folder (string): path to output folder fwd_primer (string): forward primer rev_primer (string): reverse primer pair_id (string): pair identifier threads (string): number of threads Requires: dada2, Biostrings, ShortRead, tools r packages Returns: string: path to folder with primers removed files """ script_path = utilities.get_package_file("identify_primers", "Rscript") filtN_folder = os.path.join(output_folder,"filtN") primers_folder = os.path.join(output_folder,"primers") fwd_primer_file = os.path.join(primers_folder,"fwd_primer_file.txt") rev_primer_file = os.path.join(primers_folder,"rev_primer_file.txt") cutadapt_folder = os.path.join(output_folder, "cutadapt") # run identify primers task workflow.add_task( "[vars[0]] \ --input_dir=[args[3]] \ --filtn_dir=[vars[1]] \ --primers_dir=[vars[2]] \ --threads=[args[4]] \ --fwd_primer_file=[targets[0]] \ --rev_primer_file=[targets[1]] \ --fwd_primer=[args[0]] \ --rev_primer=[args[1]] \ --pair_id=[args[2]]", targets=[fwd_primer_file,rev_primer_file, TrackedDirectory(filtN_folder)], args=[fwd_primer, rev_primer, pair_id,input_folder,threads], vars=[script_path,filtN_folder,primers_folder,output_folder], name="identify_primers" ) pair_id2 = pair_id.replace("1", "2",1) fwd_files = sorted(fnmatch.filter(os.listdir(input_folder), "*"+pair_id+"*.fastq*")) rev_files = sorted(fnmatch.filter(os.listdir(input_folder), "*" + pair_id2 + "*.fastq*")) #run cutadapt to remove primers for i in range(0,len(fwd_files)): fwd_file=os.path.join(input_folder,fwd_files[i]) rev_file = os.path.join(input_folder, rev_files[i]) workflow.add_task( cutadapt_do, depends=[fwd_primer_file, rev_primer_file, fwd_file, rev_file, TrackedDirectory(filtN_folder), TrackedExecutable("cutadapt",version_command="echo 'cutadapt' `cutadapt --version`")], targets=[TrackedDirectory(cutadapt_folder)], name="remove_primers" ) return cutadapt_folder
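
# Hedged usage sketch for the DADA2 ITS route (primer sequences and paths are
# illustrative); the returned folder holds the cutadapt-trimmed, primer-free
# fastq files:
#
#   trimmed_folder = remove_primers(workflow, "CTTGGTCATTTAGAGGAAGTAA",
#       "GCTGCGTTCTTCATCGATGC", input_folder, output_folder,
#       pair_id="_R1", threads="4")
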