def filter_fastq(workflow, method, fastq_file, output_folder, threads, maxee, trunc_len, fastq_ascii, qmax=45):
    """ Quality-filter the merged fastq file by expected error (maxee)

    Adds one filtering task (vsearch or usearch) that writes the passing and
    the discarded reads to two fasta files, then a second task that
    concatenates both into a single fasta file of all reads.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - "vsearch", anything else selects usearch
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        maxee (int): The maxee value to use for filtering.
        trunc_len (int): The value to use for max length.
        fastq_ascii (int): Passed to usearch as -fastq_ascii (not used by the vsearch command).
        qmax (int): Passed to usearch as -fastq_qmax (not used by the vsearch command).

    Requires:
        usearch or vsearch

    Returns:
        string: A path to the filtered fasta file
        string: A path to the full fasta file
    """

    # name the two outputs of the filtering step
    kept_fasta = utilities.name_files(
        "all_samples_concatenated_filtered.fasta", output_folder)
    dropped_fasta = utilities.name_files(
        "all_samples_concatenated_discarded.fasta", output_folder)

    # both tools share the thread export prefix; only the command tail and args differ
    thread_export = "export OMP_NUM_THREADS=[args[0]]; "
    if method == "vsearch":
        tool = "vsearch"
        command = thread_export + \
            "vsearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]]"
        task_args = [threads, maxee, trunc_len]
    else:
        tool = "usearch"
        command = thread_export + \
            "usearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]] -fastq_qmax [args[3]] -fastq_ascii [args[4]]"
        task_args = [threads, maxee, trunc_len, qmax, fastq_ascii]

    workflow.add_task(
        command,
        depends=[fastq_file, TrackedExecutable(tool)],
        targets=[kept_fasta, dropped_fasta],
        args=task_args,
        name=tool + "_fastq_filter")

    # create a fasta file of all reads (including the discarded)
    all_reads_fasta = utilities.name_files("all_samples_concatenated.fasta", output_folder)
    workflow.add_task(
        "cat [depends[0]] [depends[1]] > [targets[0]]",
        depends=[kept_fasta, dropped_fasta],
        targets=all_reads_fasta)

    return kept_fasta, all_reads_fasta
def picrust(workflow, otu_table_biom, output_folder):
    """ Runs picrust normalize, predict, then categorize by function

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        otu_table_biom (string): The path to the biom file (closed reference otu table).
        output_folder (string): The path of the output folder.

    Requires:
        Picrust v1.1: Software to predict metagenome function.

    Returns:
        string: The path to the categorized-by-function table in biom format.
        string: The path to the predicted metagenomes table in biom format.
    """

    def _add_picrust_step(script, source, target_basename, extra_options=""):
        """ Name the target and add one picrust task that reads source and writes the target """
        target = utilities.name_files(target_basename, output_folder)
        # first remove target file as picrust will not overwrite
        workflow.add_task(
            "remove_if_exists.py [targets[0]] ; " +
            script + " -i [depends[0]] -o [targets[0]]" + extra_options,
            depends=[source, TrackedExecutable(script)],
            targets=target,
            name=script)
        return target

    # normalize the otu table (expects biom file is json, not hdf5, format)
    normalized_table = _add_picrust_step(
        "normalize_by_copy_number.py", otu_table_biom,
        "all_samples_normalize_by_copy_number.biom")

    # predict metagenomes
    predicted_table = _add_picrust_step(
        "predict_metagenomes.py", normalized_table,
        "all_samples_predict_metagenomes.biom")

    # categorize by function
    categorized_table = _add_picrust_step(
        "categorize_by_function.py", predicted_table,
        "all_samples_categorize_by_function.biom",
        extra_options=" --level 3 -c KEGG_Pathways")

    return categorized_table, predicted_table
def cluster_otus(workflow, method, fasta_file, reference_fasta, output_folder):
    """ Cluster reads into otus and keep the non-chimeric sequences

    With vsearch two tasks are added (cluster by size, then reference based
    chimera removal against reference_fasta). With usearch a single
    cluster_otus task is added and reference_fasta is not used.

    Args:
        workflow (anadama2.workflow): an instance of the workflow class.
        method (string): Tool for sequence analysis - "vsearch", anything else selects usearch
        fasta_file (string): the path to the fasta file (filtered and dereplicated).
        reference_fasta (string): the path to reference fasta db
        output_folder (string): the path of the output folder.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the fasta file of non-chimeric otu sequences
    """

    # get the name of the final output file
    nonchimera_fasta = utilities.name_files("all_samples_otus_nonchimeras.fasta", output_folder)

    if method != "vsearch":
        # usearch clusters and writes the final otu fasta in a single step
        uparse_txt = utilities.name_files("all_samples_uparse_otus.txt", output_folder)
        workflow.add_task(
            "usearch -cluster_otus [depends[0]] -otus [targets[0]] -relabel 'OTU' -uparseout [targets[1]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=[nonchimera_fasta, uparse_txt],
            name="usearch_cluster_otus")
        return nonchimera_fasta

    cluster_txt = utilities.name_files("all_samples_vsearch_otus.txt", output_folder)
    clustered_fasta = utilities.name_files("all_otus.fasta", output_folder)

    # cluster the reads into otus
    workflow.add_task(
        "vsearch --cluster_size [depends[0]] --consout [targets[0]] --id 0.97 --relabel 'OTU' --uc [targets[1]]",
        depends=[fasta_file, TrackedExecutable("vsearch")],
        targets=[clustered_fasta, cluster_txt],
        name="vsearch_cluster_otus")

    # remove chimeras using the reference database
    workflow.add_task(
        "vsearch --uchime_ref [depends[0]] --nonchimeras [targets[0]] --strand plus --db [args[0]]",
        depends=[clustered_fasta, TrackedExecutable("vsearch")],
        targets=[nonchimera_fasta],
        args=[reference_fasta],
        name="vsearch_nochim")

    return nonchimera_fasta
def merge_fastq(workflow, input_files, output_folder):
    """ Merge all of the fastq files into a single fastq file

    The merge script is given the folder of the first input file, so all
    input files are expected to be in that folder.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files.
        output_folder (string): The path of the output folder.

    Requires:
        None

    Returns:
        string: A path to the merged file
    """

    # folder scanned by the merge script
    source_folder = os.path.dirname(input_files[0])

    # get the name of the final merged fastq file
    merged_fastq = utilities.name_files("all_samples_concatenated.fastq", output_folder)

    workflow.add_task(
        "merge_fastq.py [args[0]] _renamed.fastq [targets[0]]",
        depends=input_files,
        targets=merged_fastq,
        args=source_folder)

    return merged_fastq
def convert_from_biom_to_tsv_list(workflow, input_files, output):
    """ Convert any biom files in the set of input files to tsv

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list or dict): File paths, or a dict keyed by file path.
        output (string): The output folder; converted files are placed in
            its "biom_to_tsv" subfolder.

    Returns:
        list or dict: Same container type as the input (any non-dict
            iterable gives a list). Each biom path is replaced by the path
            of its tsv conversion, other paths are kept. Dict values are
            carried over to the new keys.
    """

    keyed_by_file = isinstance(input_files, dict)
    converted = {} if keyed_by_file else []

    for path in input_files:
        result_path = path
        if path.endswith(".biom"):
            # schedule the conversion and track the tsv in place of the biom
            result_path = utilities.name_files(
                path.replace(".biom", ".tsv"), output,
                subfolder="biom_to_tsv", create_folder=True)
            convert_from_biom_to_tsv(workflow, path, result_path)

        if keyed_by_file:
            converted[result_path] = input_files[path]
        else:
            converted.append(result_path)

    return converted
def sort_by_size(workflow, method, fasta_file, output_folder, min_size):
    """ Sort reads by size, removing those that are not of min size

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - "vsearch", anything else selects usearch
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        output_folder (string): The path of the output folder.
        min_size (int): Min size of the reads to filter.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the fasta file sorted by size
    """

    # get the name of the output file
    sorted_fasta = utilities.name_files("all_samples_sorted.fasta", output_folder)

    # the two tools differ in option style and in the name of the output flag
    if method == "vsearch":
        tool = "vsearch"
        command = "vsearch --sortbysize [depends[0]] --output [targets[0]] --minsize [args[0]]"
    else:
        tool = "usearch"
        command = "usearch -sortbysize [depends[0]] -fastaout [targets[0]] -minsize [args[0]]"

    workflow.add_task(
        command,
        depends=[fasta_file, TrackedExecutable(tool)],
        targets=sorted_fasta,
        args=min_size,
        name=tool + "_sortbysize")

    return sorted_fasta
def excel_to_csv(workflow, input_files, output_dir):
    """Converts Excel files to CSV files.

    Each file is read with pandas.read_excel using its defaults, so only
    the first worksheet is converted and the rest are ignored.

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 workflow object.
        input_files (list): A list containing all Excel files to be
            converted.
        output_dir (string): The output directory to write converted CSV
            files to.

    Requires:
        None

    Returns:
        list: A list of newly-converted CSV files.
    """
    excel_basenames = map(os.path.basename, input_files)
    csv_files = bb_utils.name_files(excel_basenames, output_dir, extension='csv')

    def _convert_excel_csv(task):
        """Task action: read the Excel dependency with pandas and write it
        to the CSV target.
        """
        source_path = task.depends[0].name
        destination_path = task.targets[0].name

        worksheet_df = pd.read_excel(source_path)
        worksheet_df.to_csv(destination_path)

    # one conversion task per (excel, csv) pair
    workflow.add_task_group(_convert_excel_csv,
                            depends=input_files,
                            targets=csv_files)

    return csv_files
def build_otu_tables(workflow, reference_taxonomy, reference_fasta, reference_mapping_results_uc,
                     otu_mapping_results_uc, otu_fasta, original_fasta, output_folder):
    """ Build the open/closed reference otu tables, denovo table, and corresponding fasta files

    A single task runs create_otu_tables_from_alignments.py with the six
    inputs followed by the six outputs as positional arguments.

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        reference_taxonomy (string): The path to the reference taxonomy file.
        reference_fasta (string): The path to the reference fasta file.
        reference_mapping_results_uc (string): The path to the reference mapping uc results file.
        otu_mapping_results_uc (string): The path to the otu mapping uc results file.
        otu_fasta (string): The path to the fasta file of otu sequences.
        original_fasta (string): The path to the fasta file (not qc or truncated).
        output_folder (string): The path of the output folder.

    Requires:
        None

    Returns:
        string: The path to the closed reference otu table (tsv)
        string: The path to the closed reference fasta file
    """

    # name the output files
    open_tsv = files.SixteenS.path("otu_table_open_reference", output_folder)
    open_fasta = utilities.name_files("all_samples_open_reference.fasta", output_folder)
    closed_tsv = files.SixteenS.path("otu_table_closed_reference", output_folder)
    closed_fasta = utilities.name_files("all_samples_closed_reference.fasta", output_folder)
    denovo_table = utilities.name_files("all_samples_denovo_otu_table.tsv", output_folder)
    read_count_table = files.SixteenS.path("read_count_table", output_folder)

    task_inputs = [reference_taxonomy, reference_fasta, reference_mapping_results_uc,
                   otu_mapping_results_uc, otu_fasta, original_fasta]
    task_outputs = [open_tsv, open_fasta, closed_tsv, closed_fasta, denovo_table, read_count_table]

    # the script takes every dependency, then every target, in order
    input_placeholders = " ".join("[depends[%d]]" % index for index in range(6))
    output_placeholders = " ".join("[targets[%d]]" % index for index in range(6))

    workflow.add_task(
        "create_otu_tables_from_alignments.py " + input_placeholders + " " + output_placeholders,
        depends=task_inputs,
        targets=task_outputs)

    return closed_tsv, closed_fasta
def run_picrust2(task, threads, otus=False):
    """ Run picrust2, first changing sequence ids to avoid all numeric (as per picrust2 tutorial)

    Args:
        task (anadama2 task): The task being run. depends[0] is the sequence
            fasta file and depends[1] is the abundance table (tsv). The folder
            of targets[0] is used as the picrust2 output folder.
        threads (int): The number of processes passed to picrust2 (-p).
        otus (bool): If set, prefix every fasta id and every table row id
            with "seq" so that the ids are not all numeric.

    Requires:
        picrust2_pipeline.py and remove_if_exists.py on the path

    Returns:
        None
    """

    picrust2_input_dir = os.path.dirname(task.depends[0].name)
    picrust2_output_dir = os.path.dirname(task.targets[0].name)

    if otus:
        # write a copy of the fasta file with "seq" added to each sequence id
        reformat_input_fasta = utilities.name_files(
            task.depends[0].name, picrust2_input_dir,
            tag="picrust_reformatted_input", create_folder=True)
        with open(task.depends[0].name) as file_handle:
            with open(reformat_input_fasta, "w") as file_handle_write:
                for line in file_handle:
                    if line.startswith(">"):
                        line = line.replace(">", ">seq")
                    file_handle_write.write(line)
    else:
        reformat_input_fasta = task.depends[0].name

    # write a copy of the table without its last column
    # NOTE(review): the dropped column is presumably the taxonomy column - confirm
    reformat_input_tsv = utilities.name_files(
        task.depends[1].name, picrust2_input_dir, tag="picrust_reformatted_input")
    with open(task.depends[1].name) as file_handle:
        with open(reformat_input_tsv, "w") as file_handle_write:
            header = file_handle.readline()
            header = "\t".join(header.split("\t")[:-1]) + "\n"
            file_handle_write.write(header)
            for line in file_handle:
                if otus:
                    # keep the row ids in sync with the renamed fasta ids
                    line = "seq" + line
                line = "\t".join(line.split("\t")[:-1]) + "\n"
                file_handle_write.write(line)

    # picrust2 will not write to an existing folder, so remove it first.
    # The targets of this command are the targets of the task (they were
    # previously passed as the task dependencies by mistake).
    utilities.run_task(
        "remove_if_exists.py [args[0]] --is-folder ; picrust2_pipeline.py -s [args[1]] -i [args[2]] -o [args[0]] -p [args[3]]",
        depends=task.depends,
        targets=task.targets,
        args=[picrust2_output_dir, reformat_input_fasta, reformat_input_tsv, threads])
def batch_convert_tsv_to_biom(workflow, tsv_files):
    """Batch converts tsv files to the biom format.

    BIOM files are written to a "biom" folder created next to the first
    TSV file and are named from the TSV sample names.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        tsv_files (list): A list containing all TSV files to be converted
            to BIOM format.

    Requires:
        Biom v2: A tool for general use formatting of biological data.

    Returns:
        list: A list containing paths to all converted BIOM files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = anadama2.Workflow()

        tsv_files = ['/tmp/foo.tsv', '/tmp/bar.tsv', '/tmp/baz.tsv']
        biom_files = common.batch_convert_tsv_to_biom(workflow, tsv_files)
    """
    sample_basenames = bb_utils.sample_names(tsv_files, '.tsv')

    # all outputs go to a single "biom" folder beside the first input
    output_dir = os.path.join(os.path.dirname(tsv_files[0]), 'biom')
    bb_utils.create_folders(output_dir)

    named_bioms = bb_utils.name_files(sample_basenames, output_dir, extension='biom')
    biom_files = []
    for named_biom in named_bioms:
        biom_files.append(os.path.join(output_dir, named_biom))

    # one conversion per (tsv, biom) pair
    for source_tsv, target_biom in zip(tsv_files, biom_files):
        convert_to_biom_from_tsv(workflow, source_tsv, target_biom)

    return biom_files
def truncate(workflow, method, input_files, output_folder, threads, trunc_len, fastq_ascii):
    """ Truncate the sequences by length, writing fasta output

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - "vsearch", anything else selects usearch
        input_files (list): A list of paths to fastq files.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task (not used by the commands).
        trunc_len (int): The value to use for max length.
        fastq_ascii (int): Passed to usearch as -fastq_ascii (not used by the vsearch command).

    Requires:
        usearch or vsearch

    Returns:
        list: Paths to the truncated files
    """

    # get the name of the output files
    truncated_files = utilities.name_files(input_files, output_folder, tag="truncated")

    # pick the command, its arguments, and the task name for the selected tool
    if method == "vsearch":
        command = "vsearch --fastx_filter [depends[0]] --fastq_trunclen [args[0]] -fastaout [targets[0]]"
        task_args = trunc_len
        task_name = "vsearch_fastx_truncate"
    else:
        command = "usearch -fastx_truncate [depends[0]] -trunclen [args[0]] -fastaout [targets[0]] -fastq_ascii [args[1]]"
        task_args = [trunc_len, fastq_ascii]
        task_name = "usearch_fastx_truncate"

    # one task per input file
    workflow.add_task_group(
        command,
        depends=input_files,
        targets=truncated_files,
        args=task_args,
        name=task_name)

    return truncated_files
def dereplicate(workflow, method, fasta_file, output_folder, threads):
    """ Dereplicate reads

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): Tool for sequence analysis - "vsearch", anything else selects usearch
        fasta_file (string): The path to the fasta file to dereplicate.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.

    Requires:
        usearch or vsearch

    Returns:
        string: Path to the dereplicated fasta file
    """

    # get the name of the output file
    dereplicated_fasta = utilities.name_files("all_samples_dereplicated.fasta", output_folder)

    if method == "vsearch":
        tool = "vsearch"
        tool_command = "vsearch --derep_fulllength [depends[0]] --output [targets[0]] --sizein --sizeout --threads [args[0]]"
    else:
        tool = "usearch"
        tool_command = "usearch -derep_fulllength [depends[0]] -fastaout [targets[0]] -sizeout -threads [args[0]]"

    # the thread count is also exported for OpenMP
    workflow.add_task(
        "export OMP_NUM_THREADS=[args[0]]; " + tool_command,
        depends=[fasta_file, TrackedExecutable(tool)],
        targets=dereplicated_fasta,
        args=threads,
        name=tool + "_derep_fulllength")

    return dereplicated_fasta
def generate_md5_checksums(workflow, files):
    """Generates MD5 checksums for the provided set of files.

    One checksum file is written per input file. Checksum files are placed
    in the folder of the first input file and are named from the sample
    name of each input with the "md5" extension.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        files (list): A list of files to generate checksums for.

    Requires:
        md5sum

    Returns:
        list: A list of the generated md5 checksum files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = anadama2.Workflow()

        files = ['/tmp/foo.txt', '/tmp/bar.txt']
        md5sum_files = common.generate_md5_checksums(workflow, files)
    """
    output_dir = os.path.dirname(files[0])
    checksum_files = bb_utils.name_files(bb_utils.sample_names(files),
                                         output_dir,
                                         extension=".md5")

    # A task group pairs each file with its own checksum target. A single
    # task would only hash depends[0] and never create the other targets.
    workflow.add_task_group_gridable('md5sum [depends[0]] > [targets[0]]',
                                     depends=files,
                                     targets=checksum_files)

    return checksum_files
def demultiplex_dual(workflow, output_folder, input_files, extension, barcode_files,
                     dual_barcode_path, min_phred, pair_identifier):
    """Demultiplex the files (dual indexed paired)

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        output_folder (string): The path of the output folder.
        input_files (list): A list of paths to fastq(gz) files for input to ea-utils.
        extension (string): The extension for all files.
        barcode_files (list): A list of barcode files.
        dual_barcode_path (string): A path to the dual barcode file
            (tab-delimited, sample name in the first column, "#" lines ignored).
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.

    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.

    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files

    Exits:
        With an error message if the dual barcode file cannot be read.
    """

    # capture the demultiplex stats in log file, one for each set of input files
    demultiplex_log = utilities.name_files(input_files[0], output_folder, subfolder="demultiplex",
                                           extension="log", create_folder=True)
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable(
        "fastq-multx",
        version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # get barcode files
    barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier)

    # get the second pair identifier
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    # read the barcode file, closing the handle even if the read fails
    try:
        with open(dual_barcode_path) as file_handle:
            lines = file_handle.readlines()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path)

    run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "")

    # get the names of the expected output files, two (one per pair) for each sample
    demultiplex_files = set()
    for line in lines:
        # ignore headers or comment lines
        if line.startswith("#"):
            continue
        sample_name = line.split("\t")[0]
        if sample_name:
            sample_prefix = demultiplex_output_folder + "/" + run_name + "_" + sample_name
            demultiplex_files.add(sample_prefix + pair_identifier + "." + extension)
            demultiplex_files.add(sample_prefix + pair_identifier2 + "." + extension)

    # The tracked executable is a dependency (as for the other tasks in this
    # file) so that it is tracked by the workflow. It is placed after the
    # five file dependencies referenced by index in the command.
    workflow.add_task(
        "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]] "
        "-o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]] "
        "-q [args[2]] > [targets[0]]",
        depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0],
                 fastq_multx_tracked],
        args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name],
        targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)],
        name="demultiplex_dual")

    demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files)

    return demultiplex_files, demultiplex_output_folder
error_ratesR_path, args.threads, args.minoverlap, args.maxmismatch) # construct otu seqtab_file_path, read_counts_steps_path, seqs_fasta_path = dadatwo.const_seq_table( workflow, args.output, filtered_dir, mergers_file_path, args.threads) # centroid alignment centroid_fasta = files.SixteenS.path("msa_nonchimera", args.output) sixteen_s.centroid_alignment(workflow, seqs_fasta_path, centroid_fasta, args.threads, task_name="clustalo_nonchimera") # phylogenetic tree closed_tree = utilities.name_files("closed_reference.tre", args.output) sixteen_s.create_tree(workflow, centroid_fasta, closed_tree) # assign taxonomy closed_reference_tsv = dadatwo.assign_taxonomy(workflow, args.output, seqtab_file_path, args.dada_db, args.threads) # functional profiling # check for picrust1 as not an option with this workflow if args.picrust_version == "1": print( "WARNING: PICRUSt v1 is not compatible with ASV tables so will not be run for this workflow." ) else: categorized_function = sixteen_s.functional_profile(
def add_metadata_to_tsv(workflow, analysis_files, metadata_file, dtype,
                        id_col, col_replace=None, col_offset=-1,
                        metadata_rows=None, target_cols=None, aux_files=None,
                        na_rep=""):
    """Adds metadata to the top of a tab-delimited file. This function is
    meant to be called on analysis files to append relevant metadata to the
    analysis output found in the file. An example can be seen below:

        sample          Sample1  Sample2  Sample3  Sample4
        Age             87       78       3        2
        Cohort          Healthy  Healthy  IBD      IBD
        Favorite_color  Yellow   Blue     Green    Yellow
        Sex             0        1        0        1
        Bacteria        1        1        1        1
        Bacteria|Actinobacteria|Actinobacteria  0.0507585  0.252153  0.161725  0.1

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 workflow object.
        analysis_files (list): Target TSV's to add metadata to.
        metadata_file (string): The path to the metadata file (CSV) to pull
            from. Must contain the 'data_type' and 'date_of_receipt' columns.
        dtype (string): Data type of the files; metadata rows are subset on
            the 'data_type' column with this value.
        id_col (string): The column name in the supplied metadata file to
            attempt to subset on using ID's from the analysis file.
        col_replace (list): A list of string fragments that should be
            searched for and removed from the column headers of the
            analysis file.
        col_offset (int): In certain situations a series of metadata columns
            will be present prior to columns containing analysis results.
            In these cases an offset needs to be provided for proper
            creation of PCL files.
        metadata_rows (int): If our analysis file already contains some
            metadata rows at the top of the file (in effect already a PCL
            file) this parameter indicates how many rows of metadata exist.
        target_cols (list): A list of columns to filter the metadata file
            on. May be nested; it is flattened before use.
        aux_files (list): Any additional (tab-delimited) metadata files to
            integrate into analysis files. The first column of each is used
            as the join column.
        na_rep (string): String representation for any empty cell in our
            PCL file. Defaults to an empty string.

    Requires:
        None

    Returns:
        list: A list containing the path to all PCL files to be written.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import metadata

        workflow = anadama2.Workflow()

        target_cols = ['age', 'sex', 'smoking']
        col_replace = ['_taxonomic_profile', '_functional_profile']
        out_files = metadata.add_metadata_to_tsv(workflow,
                                                 ['/tmp/metaphlan2.out'],
                                                 '/tmp/metadata.csv',
                                                 'metagenomics',
                                                 'External ID',
                                                 col_replace=col_replace,
                                                 target_cols=target_cols)
    """
    metadata_df = pd.read_csv(metadata_file, dtype='str', parse_dates=['date_of_receipt'])

    # Because of how YAML inherits lists the target columns can be nested,
    # so flatten them. Materialize the result once as a list (and tolerate
    # the default of None) so that every task sees the same columns.
    flat_target_cols = list(funcy.flatten(target_cols)) if target_cols else []

    def _workflow_add_metadata_to_tsv(task):
        """Task action: merge the metadata rows on top of one analysis file
        and write the result to the PCL target.
        """
        analysis_file = task.depends[0].name
        pcl_out = task.targets[0].name

        # The analysis files are tab-delimited, so the separator must be
        # given (the pandas default is a comma).
        analysis_df = pd.read_csv(analysis_file, sep='\t', dtype='str', header=None)
        pcl_metadata_df = None
        header = True

        # Going to make the assumption that the next row following our PCL
        # metadata rows is the row containing the ID's that we will use to merge
        # the analysis file with our metadata file and we can use these same
        # ID's to merge the PCL metadata rows into the larger metadata file.
        if metadata_rows:
            # work on a copy so the in-place edits below do not act on a slice
            pcl_metadata_df = analysis_df[:metadata_rows+1].copy()
            header = None

            offset_cols = list(range(0, col_offset+1))
            pcl_metadata_df.drop(pcl_metadata_df.columns[offset_cols[:-1]], axis=1, inplace=True)
            pcl_metadata_df = pcl_metadata_df.T.reset_index(drop=True).T
            # label the ID row with the metadata ID column name (explicit
            # .loc instead of a chained assignment that may hit a copy)
            pcl_metadata_df.loc[metadata_rows, 0] = id_col
            pcl_metadata_df = pcl_metadata_df.T
            pcl_metadata_df = reset_column_headers(pcl_metadata_df)

            analysis_df.drop(analysis_df.index[list(range(0, metadata_rows))], inplace=True)
            analysis_df.rename(columns=analysis_df.iloc[0], inplace=True)
        else:
            analysis_df = hmp2_utils.misc.reset_column_headers(analysis_df)

        sample_ids = analysis_df.columns.tolist()[col_offset+1:]

        # NOTE(review): a single column presumably means the file was not
        # split on the expected delimiter - confirm the intent of this check
        if len(sample_ids) == 1:
            raise ValueError('Could not parse sample ID\'s:', sample_ids)

        if col_replace:
            new_ids = sample_ids
            for replace_str in col_replace:
                new_ids = [sid.replace(replace_str, '') if not pd.isnull(sid) else sid
                           for sid in new_ids]

            if new_ids != sample_ids:
                sample_ids_map = dict(zip(sample_ids, new_ids))
                sample_ids = new_ids
                analysis_df.rename(columns=sample_ids_map, inplace=True)

        subset_metadata_df = metadata_df[(metadata_df.data_type == dtype) &
                                         (metadata_df[id_col].isin(sample_ids))]

        if aux_files:
            for aux_file in aux_files:
                aux_metadata_df = pd.read_table(aux_file, dtype='str')
                aux_metadata_cols = aux_metadata_df.columns.tolist()
                join_id = aux_metadata_cols[0]

                ## We need to do this in two stages. If the columns already exist
                ## here we want to update them. If they do not exist we append
                ## them.
                subset_metadata_cols = subset_metadata_df.columns.tolist()
                new_cols = set(aux_metadata_cols[1:]) - set(subset_metadata_cols)
                existing_cols = set(aux_metadata_cols[1:]).intersection(subset_metadata_cols)

                if new_cols:
                    aux_metadata_new_df = aux_metadata_df.filter(items=aux_metadata_cols[:1] +
                                                                 list(new_cols))
                    subset_metadata_df = pd.merge(subset_metadata_df, aux_metadata_new_df,
                                                  how='left', on=join_id)

                if existing_cols:
                    aux_metadata_existing_df = aux_metadata_df.filter(items=aux_metadata_cols[:1] +
                                                                      list(existing_cols))
                    subset_metadata_df.set_index(join_id, inplace=True)
                    aux_metadata_existing_df.set_index(join_id, inplace=True)
                    subset_metadata_df.update(aux_metadata_existing_df)
                    subset_metadata_df.reset_index(inplace=True)

        # PCL metadata rows only exist when metadata_rows was provided
        if pcl_metadata_df is not None and not pcl_metadata_df.empty:
            subset_metadata_df = pd.merge(subset_metadata_df, pcl_metadata_df,
                                          how='left', on=id_col)

        if flat_target_cols:
            # build a fresh list for each task so that the ID column is not
            # added again to a shared list every time a task runs
            subset_metadata_df = subset_metadata_df.filter([id_col] + flat_target_cols)

        subset_metadata_df = subset_metadata_df.T
        subset_metadata_df = reset_column_headers(subset_metadata_df)
        subset_metadata_df = subset_metadata_df.reset_index()
        subset_metadata_df.fillna('NA', inplace=True)

        # name the metadata label column after the matching analysis column
        # so that the two frames line up when concatenated
        _col_offset = col_offset-1 if col_offset != -1 else col_offset
        col_name = analysis_df.columns[_col_offset+1]
        col_name = '' if col_name == "index" else col_name
        subset_metadata_df.rename(columns={'index': col_name}, inplace=True)

        # shift the analysis rows below the metadata rows
        analysis_df.index = analysis_df.index + len(subset_metadata_df.index)
        analysis_metadata_df = pd.concat([subset_metadata_df, analysis_df], axis=0)
        analysis_metadata_df = analysis_metadata_df[analysis_df.columns]

        analysis_metadata_df.to_csv(pcl_out, index=False, header=header,
                                    sep='\t', na_rep=na_rep)

    output_folder = os.path.dirname(analysis_files[0])
    pcl_files = bb_utils.name_files(analysis_files, output_folder, extension="pcl.tsv")

    # The resource equations reference the first dependency of each task
    # with the "[depends[0]]" placeholder.
    workflow.add_task_group(_workflow_add_metadata_to_tsv,
                            depends=analysis_files,
                            targets=pcl_files,
                            time="1*60 if file_size('[depends[0]]') < 1 else 2*60",
                            mem="4*1024 if file_size('[depends[0]]') < 1 else 3*12*1024",
                            cores=1,
                            name="Generate analysis PCL output file")

    return pcl_files
def generate_sample_metadata(workflow, data_type, in_files, metadata_file,
                             output_dir, id_column='External ID'):
    """Generates a series of individual metadata files in CSV format
    from the provided merged metadata file. Each of the provided samples
    has a metadata file generated to accompany any product files generated
    by the analysis pipelines.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        data_type (string): The data type of the provided samples; matched
            against the 'data_type' column of the merged metadata file.
        in_files (list): A list of files that should have corresponding
            metadata files written.
        metadata_file (string): Path to the merged metadata file.
        output_dir (string): Path to output directory; the sample metadata
            files are written to its 'metadata' subfolder.
        id_column (string): The ID column to attempt to map sample names
            to in the merged metadata file. Default set to "External ID"
            but can change depending on the data type.

    Requires:
        None

    Returns:
        list: A list containing the path to all sample metadata files
            created.

    Raises:
        ValueError: When the task runs and no metadata rows match the
            samples and data type.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import metadata

        workflow = anadama2.Workflow()

        samples = ['sampleA', 'sampleB']
        metadata_file = '/tmp/merged_metadata.csv'
        output_dir = '/tmp/metadata'

        metadata_files = metadata.generate_sample_metadata(workflow,
                                                           'metagenomics',
                                                           samples,
                                                           metadata_file,
                                                           output_dir)
    """
    metadata_df = pd.read_csv(metadata_file)
    samples = bb_utils.sample_names(in_files)

    output_metadata_files = bb_utils.name_files(samples,
                                                output_dir,
                                                extension='csv',
                                                subfolder='metadata',
                                                create_folder=True)
    sample_metadata_dict = dict(zip(samples, output_metadata_files))

    def _workflow_gen_metadata(task):
        """Task action: write one CSV per metadata row matching the samples."""
        metadata_subset = metadata_df.loc[(metadata_df[id_column].isin(samples)) &
                                          (metadata_df['data_type'] == data_type)]

        if metadata_subset.empty:
            raise ValueError('Could not find metadata associated with samples.',
                             ",".join(samples))

        for (sample_id, row) in metadata_subset.iterrows():
            sample_metadata_file = sample_metadata_dict.get(row[id_column])
            metadata_subset.xs(sample_id).to_csv(sample_metadata_file, index=False)

    workflow.add_task(_workflow_gen_metadata,
                      targets=output_metadata_files,
                      depends=in_files + [metadata_file],
                      name='Generate sample metadata')

    # return a real list (as documented) rather than a dict view, which on
    # Python 3 cannot be indexed or concatenated by callers
    return list(sample_metadata_dict.values())
exit_if_not_found=True) ### STEP #1: Run quality control on all input files ### sample_names = utilities.sample_names(input_files, args.input_extension) input_pair1, input_pair2 = utilities.paired_files(input_files, args.input_extension, args.pair_identifier) paired = False if input_pair1: sample_names = utilities.sample_names(input_pair1, args.input_extension, args.pair_identifier) qc_targets = [ utilities.name_files([ name + ".trimmed.1.fastq", name + ".trimmed.2.fastq", name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq", name + ".trimmed.single.12.fastq" ], args.output, subfolder="kneaddata", create_folder=True) for name in sample_names ] paired = True for target_set, input_R1, input_R2, name in zip(qc_targets, input_pair1, input_pair2, sample_names): workflow.add_task( "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]", depends=[input_R1, input_R2, TrackedExecutable("kneaddata")], targets=[target_set[0], target_set[1], target_set[4]], args=[ os.path.dirname(target_set[0]), args.threads, name, target_set[2], target_set[3]
def main(workflow):
    """Build and run the metatranscriptomic (MTX) workflow.

    Reads the config and manifest files named by the parsed command line
    arguments, stages and quality-controls the MTX inputs, optionally
    processes accompanying metagenomic (MGX) inputs, adds functional
    profiling, DNA/RNA ratio and tar tasks, and finally executes the
    workflow.

    Args:
        workflow (anadama2.Workflow): The workflow that tasks are added to.
            Its parsed arguments must provide ``config_file``,
            ``manifest_file``, ``metadata_file``, ``threads`` and the
            optional per-tool ``threads_*`` overrides.

    Returns:
        None
    """
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    # Per-tool thread counts fall back to the global thread count.
    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files_mtx,
                                           project_dirs_mtx[1],
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier_mtx = "_R1"
        else:
            paired_end_seqs = deposited_files_mtx

        # Only pass adapter trimming options when the manifest names an
        # adapters file; previously the option variable was unbound (and the
        # call raised NameError) whenever no adapters file was configured.
        # NOTE(review): assumes quality_control() has a default for
        # additional_options -- confirm against its definition.
        qc_options = {
            'databases': [contaminate_db, rrna_db, mtx_db],
            'pair_identifier': pair_identifier_mtx,
            'remove_intermediate_output': True,
        }
        if adapters_file:
            qc_options['additional_options'] = (
                " --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                **qc_options)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we will proceed just using the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed directly in our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                # NOTE(review): the extension is passed twice here; the third
                # argument may have been meant to be pair_identifier_mgx.
                sample_names_mgx = sample_names(input_files_mgx,
                                                file_extension_mgx,
                                                file_extension_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    # Bind the MGX-specific name: the quality control call
                    # below reads paired_end_seqs_mgx, which the BAM branch
                    # previously never set.
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                       deposited_files_mgx,
                                                       project_dirs_mgx[1],
                                                       paired_end=True,
                                                       compress=False,
                                                       threads=args.threads)
                    pair_identifier_mgx = "_R1"
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx,
                                                       pair_identifier_mgx)

                # NOTE(review): unlike the MTX call above, no file extension
                # argument is passed here -- confirm against quality_control().
                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db, rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')
                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                # Explicit loop: a bare map() is lazy on Python 3 and would
                # never create the folders.
                for pub_dir in [pub_wgs_raw_dir,
                                pub_wgs_tax_profile_dir,
                                pub_wgs_func_profile_dir]:
                    create_folders(pub_dir)

                # Use the MGX sample name list; sample_names itself is the
                # helper function, not a list of names.
                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                   project_dirs_mgx[1],
                                                   subfolder='genes',
                                                   tag='genefamilies_relab',
                                                   extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                 project_dirs_mgx[1],
                                                 subfolder='pathways',
                                                 tag='pathabundance_relab',
                                                 extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))

                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir,
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                              [gene_file, ecs_file, path_file],
                                              tar_path,
                                              depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile otherwise we will run a taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(
                cleaned_fastqs_mtx,
                '.fastq',
                data_files.get('MTX').get('metadata_id_col', 'External ID'),
                input_tax_profiles,
                data_files.get('MGX').get('tax_profile_id', 'External ID'),
                args.metadata_file,
                tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run
            # them through the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            # NOTE(review): this call passes a file extension while the two
            # functional_profile calls above do not -- confirm the signature.
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            # list.extend() returns None, so concatenate instead of rebinding
            # func_outs_mtx to the result of extend().
            func_outs_mtx = list(func_outs_mtx) + list(func_outs_match_mtx)
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed
        # in our visualization output.
        # NOTE(review): func_outs_mgx is only bound when MGX sequence files
        # were supplied above; without them this call raises NameError.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        for pub_dir in [pub_mtx_raw_dir,
                        pub_mtx_tax_profile_dir,
                        pub_mtx_func_profile_dir]:
            create_folders(pub_dir)

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)

    workflow.go()
def stage_files(workflow, input_files, target_dir, delete=False,
                preserve=False, symlink=False):
    """Stage input files into a target directory via rsync or symlinks.

    One task is added per input file. Each task first removes any existing
    target and then either copies the file with rsync or, when ``symlink``
    is True, links it with ``ln -s``.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        input_files: A collection of input files to be staged.
        target_dir (string): Path to the existing directory that files
            are staged into.
        delete (boolean): Accepted for interface compatibility; not used
            by this function.
        preserve (boolean): If True, add an ``--rsync-path`` option that
            runs ``mkdir -p`` on the source file's directory name before
            copying. Ignored when ``symlink`` is True.
        symlink (boolean): If True create symlinks instead of copying
            with rsync. Defaults to False.

    Requires:
        rsync v3.0.6+: A versatile file copying tool.

    Returns:
        list: The target paths of the staged files.

    Raises:
        OSError: If ``target_dir`` does not exist.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = anadama2.Workflow()

        staged_files = common.stage_files(workflow,
                                          ['/tmp/fooA.sam', '/tmp/fooB.sam'],
                                          '/tmp/out_dir')

        workflow.go()
    """
    target_missing = not os.path.exists(target_dir)
    if target_missing:
        raise OSError(2, 'Target directory does not exist', target_dir)

    ## TODO: We need to preserve the file directory structure here because
    ##       it tells when the files were received and is used by the website.
    staged_paths = bb_utils.name_files(input_files, target_dir)

    ## TODO: Figure out a better way to handle this rather than creating
    ##       N rsync calls.
    if symlink:
        # Symlinking takes precedence over any rsync options.
        command = "remove_if_exists.py [targets[0]] ; ln -s [depends[0]] [targets[0]]"
    else:
        command = "remove_if_exists.py [targets[0]] ; rsync -avz [depends[0]] [targets[0]]"
        if preserve:
            command = command.replace(
                '-avz',
                '--rsync-path=\"mkdir -p `dirname '
                '[depends[0]]`\" -avz')

    workflow.add_task_group(command,
                            depends=input_files,
                            targets=staged_paths)

    return staged_paths
def deinterleave_fastq(workflow, input_files, output_dir, threads=1, compress=True):
    """Deinterleaves a FASTQ file producing paired-end FASTQ reads.

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 Workflow object.
        input_files (list): A list of FASTQ files to deinterleave.
        output_dir (string): The output directory to write paired-end reads
            to.
        threads (int): The number of threads/cores requested for each task;
            also appended to the command when compressing.
        compress (bool): Compress the FASTQ files generated.

    Requires:
        deinterleave_fastq.sh: The script invoked by each task.

    Returns:
        list: A list of (mate 1, mate 2) output file pairs, one per input.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks.file_conv import deinterleave_fastq

        workflow = Workflow()
        paired_end_fastqs = deinterleave_fastq(workflow,
                                               ['foo.fastq', 'bar.fastq'],
                                               '/tmp/out_dir',
                                               threads=4)
    """
    deinterleave_cmd = "deinterleave_fastq.sh < [depends[0]] [targets[0]] [targets[1]]"
    out_ext = "fastq"

    if compress:
        deinterleave_cmd += " " + "compress"
        out_ext = "fastq.gz"
        deinterleave_cmd += " " + str(threads)

    base_names = [os.path.basename(in_file) for in_file in input_files]
    mate_1_files = bb_utils.name_files(base_names,
                                       output_dir,
                                       tag="R1",
                                       extension=out_ext)
    mate_2_files = bb_utils.name_files(base_names,
                                       output_dir,
                                       tag="R2",
                                       extension=out_ext)

    if "gz" in out_ext:
        # Move the mate tag in front of the original ".fastq" extension.
        mate_1_files = [fname.replace('.fastq_R1', '_R1.fastq')
                        for fname in mate_1_files]
        mate_2_files = [fname.replace('.fastq_R2', '_R2.fastq')
                        for fname in mate_2_files]

    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator,
    # so the workflow would consume it and the caller would get nothing back.
    output_files = list(zip(mate_1_files, mate_2_files))

    workflow.add_task_group_gridable(deinterleave_cmd,
                                     depends=input_files,
                                     targets=output_files,
                                     time=5 * 60,
                                     mem=4096,
                                     cores=threads)

    return output_files
def functional_profile(workflow, closed_reference_tsv, closed_reference_fasta,
                       picrust_version, threads, output_folder, otus):
    """ Add the picrust tasks that produce the functional profile

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            closed_reference_tsv (string): The path to the closed reference tsv file.
            closed_reference_fasta (string): The path to the closed reference fasta file.
            picrust_version (str): The version of picrust to use ("1" selects
                the v1 pipeline, anything else the v2 pipeline).
            threads (int): The number of threads/cores for each task.
            output_folder (string): The path of the output folder.
            otus (bool): Are the inputs from OTUs (so all numerical ids).

        Requires:
            Picrust v1.1 or v2: Software to predict metagenome function.
            Biom v2: A tool for general use formatting of biological data.

        Returns:
            string: For version "1" the path to the categorized functional
                data in tsv format; otherwise the path to the picrust2
                "out.tre" target.
    """

    if picrust_version != "1":
        # v2 pipeline: a single task with the tree file as its target
        picrust2_tree = utilities.name_files("out.tre", output_folder,
                                             subfolder="picrust2")
        picrust2_task = utilities.partial_function(run_picrust2,
                                                   threads=threads, otus=otus)
        workflow.add_task(picrust2_task,
                          depends=[closed_reference_fasta, closed_reference_tsv],
                          targets=picrust2_tree)
        return picrust2_tree

    # v1 pipeline: picrust works on biom so convert the tsv input first
    reference_biom = utilities.name_files(closed_reference_tsv, output_folder,
                                          extension="biom")
    convert_to_biom_from_tsv(
        workflow, closed_reference_tsv, reference_biom,
        options="--process-obs-metadata=taxonomy --output-metadata-id=taxonomy")

    # run picrust to get the categorized and predicted functional data
    categorized_biom, predicted_biom = picrust(workflow, reference_biom,
                                               output_folder)

    # convert both biom outputs back to tsv (predicted first, then categorized)
    converted_tsv = None
    for biom_file in (predicted_biom, categorized_biom):
        converted_tsv = utilities.name_files(biom_file, output_folder,
                                             extension="tsv")
        convert_from_biom_to_tsv(workflow, biom_file, converted_tsv)

    # the last conversion is the categorized data, which is what is returned
    return converted_tsv
def merge_pairs_and_rename(workflow, method, input_files, extension,
                           output_folder, pair_identifier, threads, fastq_ascii):
    """ Merge the files if pairs and rename sequence ids to match sample id

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            method (string): tools for sequence analysis, usearch default or vsearch
            input_files (list): A list of paths to fastq files.
            extension (string): The extension for all files.
            output_folder (string): The path of the output folder.
            pair_identifier (string): The string in the file basename to identify
                the first pair in the set.
            threads (int): The number of threads for each task.
            fastq_ascii: Value forwarded to the usearch merge task as
                ``fastq_ascii``.

        Requires:
            usearch or vsearch

        Returns:
            list: A list of the renamed files.
    """

    pair1, pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    if pair1 and pair2:
        # paired input files were found

        # if the files are gzipped, first decompress as fastq_mergepairs will
        # take in fastq.gz but the output will not be correctly formatted
        if pair1[0].endswith(".gz"):
            # get the names of the decompressed output files
            decompressed_pair1 = utilities.name_files(
                [os.path.basename(path).replace(".gz", "") for path in pair1],
                output_folder,
                subfolder="merged_renamed")
            decompressed_pair2 = utilities.name_files(
                [os.path.basename(path).replace(".gz", "") for path in pair2],
                output_folder,
                subfolder="merged_renamed")

            # add tasks to decompress the files
            workflow.add_task_group("gunzip -c [depends[0]] > [targets[0]]",
                                    depends=pair1 + pair2,
                                    targets=decompressed_pair1 + decompressed_pair2)

            # the pair files to be used for the remaining tasks are those
            # that are decompressed
            pair1 = decompressed_pair1
            pair2 = decompressed_pair2

        # get the sample names from the input file names
        sample_names = [
            os.path.basename(path).replace(pair_identifier + ".fastq", "")
            for path in pair1
        ]

        # get the names of the output files
        stitched_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="stitched",
                                              extension="fastq",
                                              create_folder=True)
        unjoined_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="unjoined",
                                              extension="fastq")

        # run usearch/vsearch to merge pairs, if input files are non-empty
        for read1, read2, stitched_output, unjoined_output in zip(
                pair1, pair2, stitched_files, unjoined_files):
            if method == 'vsearch':
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="vsearch",
                                               threads=threads),
                    depends=[read1, read2, TrackedExecutable("vsearch")],
                    targets=[stitched_output, unjoined_output],
                    name="vsearch_fastq_mergepairs")
            else:
                # "usearch" was previously misspelled as "userach"
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="usearch",
                                               threads=threads,
                                               fastq_ascii=fastq_ascii),
                    depends=[read1, read2, TrackedExecutable("usearch")],
                    targets=[stitched_output, unjoined_output],
                    name="usearch_fastq_mergepairs")

        # merge the stitched and unjoined from the prior step
        renamed_files = utilities.name_files(sample_names,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq")
        # materialise the pairs so the dependencies are a real list on
        # Python 3, where zip() returns a one-shot iterator
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]",
            depends=list(zip(stitched_files, unjoined_files)),
            targets=renamed_files)
    else:
        # these files are not pairs and do not need to be merged
        # rename the files
        renamed_files = utilities.name_files(input_files,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq",
                                             create_folder=True)
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]",
            depends=input_files,
            targets=renamed_files)

    return renamed_files
def taxonomic_profile(workflow, method, filtered_fasta_file, truncated_fasta_file,
                      original_fasta_file, output_folder, threads, percent_identity,
                      reference_usearch, reference_fasta, reference_taxonomy,
                      min_size, bypass_msa=False):
    """ Add the OTU picking, alignment and OTU table tasks for taxonomic profiling

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            method (string): tools for sequence analysis - usearch (default) or vsearch
            filtered_fasta_file (string): The path to the fasta file (filtered and dereplicated).
            truncated_fasta_file (string): The path to the fasta file (truncated not qced).
            original_fasta_file (string): The path to the fasta file (not qc or truncated).
            output_folder (string): The path of the output folder.
            threads (int): The number of threads for each task.
            percent_identity (float): The percent identity to use for alignments.
            reference_usearch (string): The path to the reference usearch formatted database.
            reference_fasta (string): The path to the reference fasta file.
            reference_taxonomy (string): The path to the reference taxonomy file.
            min_size (int): Min size of the reads to filter.
            bypass_msa (bool): Bypass msa clustering and tree generation.

        Requires:
            usearch (as of usearch v9: has built-in de novo chimera filtering) or vsearch
            clustal omega: multiple sequence alignment for proteins

        Returns:
            The two values produced by build_otu_tables: the closed
            reference tsv and the closed reference fasta.
    """

    run_msa = not bypass_msa

    # step 1: pick the otus
    otus = pick_otus(workflow, method, filtered_fasta_file, reference_fasta,
                     output_folder, threads, min_size)

    # step 2: align the otu centroids (skipped when msa is bypassed)
    if run_msa:
        nonchimera_msa = files.SixteenS.path("msa_nonchimera", output_folder)
        centroid_alignment(workflow, otus, nonchimera_msa, threads,
                           task_name="clustalo_nonchimera")

    # step 3: map the truncated reads onto the otus
    otu_map_uc = utilities.name_files("all_samples_otu_mapping_results.uc",
                                      output_folder)
    otu_map_tsv = utilities.name_files("all_samples_otu_mapping_results.tsv",
                                       output_folder)
    global_alignment(workflow, method, truncated_fasta_file, otus,
                     percent_identity, threads, otu_map_uc, otu_map_tsv)

    # step 4: map the otus onto the reference database, keeping the top hit
    reference_map_uc = utilities.name_files(
        "all_samples_green_genes_mapping_results.uc", output_folder)
    reference_map_tsv = utilities.name_files(
        "all_samples_green_genes_mapping_results.tsv", output_folder)
    global_alignment(workflow, method, otus, reference_usearch,
                     percent_identity, threads, reference_map_uc,
                     reference_map_tsv, top_hit_only=True)

    # step 5: build the open/closed reference tables
    otu_tables = build_otu_tables(workflow, reference_taxonomy, reference_fasta,
                                  reference_map_uc, otu_map_uc, otus,
                                  original_fasta_file, output_folder)
    closed_reference_tsv, closed_ref_fasta = otu_tables

    # step 6: align the closed reference sequences and build a tree
    if run_msa:
        closed_msa = files.SixteenS.path("msa_closed_reference", output_folder)
        tree_file = utilities.name_files("closed_reference.tre", output_folder)
        centroid_alignment(workflow, closed_ref_fasta, closed_msa, threads,
                           task_name="clustalo_closed_reference")
        create_tree(workflow, closed_msa, tree_file)

    return closed_reference_tsv, closed_ref_fasta
def demultiplex(workflow, input_files, extension, output_folder, barcode_file,
                index_files, min_phred, pair_identifier):
    """Demultiplex the files (single end or paired)

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            input_files (list): A list of paths to fastq files for input to ea-utils.
            extension (string): The extension for all files.
            output_folder (string): The path of the output folder.
            barcode_file (string): A file of barcodes.
            index_files (list): A list of paths to the index files (at most one).
            min_phred (int): The min phred quality score to use in the demultiplex command.
            pair_identifier (string): The string in the file basename to identify
                the first pair in the set.

        Requires:
            ea-utils fastq-multx: A tool to demultiplex fastq files.

        Returns:
            list: A list of the demultiplexed files
            string: output folder of demultiplexed files

        Exits (via sys.exit) if more than one index file is given or the
        barcode file can not be read.
    """

    # error if there is more than one index file
    if len(index_files) > 1:
        sys.exit("ERROR: Only one index file expected for demultiplexing step.")

    # read the barcode file to get the expected output files
    # (the context manager closes the handle even if reading fails)
    try:
        with open(barcode_file) as file_handle:
            lines = file_handle.readlines()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read barcode file: " + barcode_file)

    samples = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.rstrip().split("\t")[0]
            if sample_name:
                samples.add(sample_name)

    # get the names of the expected output files
    demultiplex_fastq_files = utilities.name_files(samples, output_folder,
                                                   subfolder="demultiplex",
                                                   extension="fastq")

    # name the barcode file with the reverse complement barcodes added
    expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt",
                                                 output_folder,
                                                 subfolder="demultiplex",
                                                 create_folder=True)

    # create a file that includes the reverse complements of the barcodes
    workflow.add_task(
        "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]",
        depends=barcode_file,
        targets=expanded_barcode_file)

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension,
                                                      pair_identifier)

    # capture the demultiplex stats in output files, one for each set of input files
    if input_pair1:
        demultiplex_log = utilities.name_files(input_pair1[0], output_folder,
                                               subfolder="demultiplex",
                                               extension="log")
    else:
        demultiplex_log = utilities.name_files(input_files[0], output_folder,
                                               subfolder="demultiplex",
                                               extension="log")

    # get the output folder for all files
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # get the basenames of the output files, one for each sample
    demultiplex_output_basenames = utilities.name_files(samples, output_folder,
                                                        subfolder="demultiplex")

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable(
        "fastq-multx",
        version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    if input_pair1 and input_pair2:
        # this run has paired input files
        # get the second pair identifier
        pair_identifier2 = pair_identifier.replace("1", "2", 1)

        # get the names of the expected output files
        demultiplex_fastq_files_R1 = [basename + pair_identifier + ".fastq"
                                      for basename in demultiplex_output_basenames]
        demultiplex_fastq_files_R2 = [basename + pair_identifier2 + ".fastq"
                                      for basename in demultiplex_output_basenames]
        demultiplex_fastq_files = demultiplex_fastq_files_R1 + demultiplex_fastq_files_R2

        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_pair1[0],
                         input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier,
                      pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_pair1[0], input_pair2[0],
                         fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier,
                      pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
    else:
        # this run has single end input files
        # get the names of the expected output files
        demultiplex_fastq_files = [basename + pair_identifier + ".fastq"
                                   for basename in demultiplex_output_basenames]

        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_files[0],
                         fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")
        else:
            # the tracked executable belongs with the dependencies, as in the
            # other three branches; it used to be listed in args by mistake
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_files[0],
                         fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")

    demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log,
                                                demultiplex_fastq_files)

    return demultiplex_fastq_files, demultiplex_output_folder
workflow.add_argument("max-strains", desc="the max number of strains to profile", default=20, type=int)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True)

### STEP #1: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, input_files, args.output, args.threads, args.input_extension)
else:
    # A bare "elif:" (no condition) is a syntax error; this branch handles
    # the case where taxonomic profiling is bypassed, so it is the "else".
    # NOTE(review): demultiplex_output_folder is not defined in the visible
    # part of this script -- confirm it is bound earlier in the file.
    sample_names = utilities.sample_names(input_files, args.input_extension)
    tsv_profiles = utilities.name_files(sample_names, demultiplex_output_folder, tag="taxonomic_profile", extension="tsv")

    # check all of the expected profiles are found
    if len(tsv_profiles) != len(list(filter(os.path.isfile, tsv_profiles))):
        sys.exit("ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"+"\n".join(tsv_profiles))

    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, tsv_profiles, args.output, args.threads, "tsv", already_profiled=True)

    # look for the sam profiles
    taxonomy_sam_files = utilities.name_files(sample_names, demultiplex_output_folder, tag="bowtie2", extension="sam")

    # if they do not all exist, then bypass strain profiling if not already set
    if len(taxonomy_sam_files) != len(list(filter(os.path.isfile, taxonomy_sam_files))):
        print("Warning: Bypassing taxonomic profiling but not all taxonomy sam files are present in the input folder. Strain profiling will be bypassed. Expecting the following input files:\n"+"\n".join(taxonomy_sam_files))
        args.bypass_strain_profiling = True

### STEP #2: Run strain profiling
# Provide taxonomic profiling output so top strains by abundance will be selected
def main(workflow):
    """Assemble and run the MGX processing workflow.

    Reads the 'MGX' section of the config file and the submission manifest
    named on the command line. When the manifest lists MGX files, the
    following steps are added to the workflow before it is run:

      * stage the manifest and the input files into the deposition directory
      * convert BAM input to fastq when the input extension is ".bam"
      * quality control, taxonomic profiling and functional profiling
      * convert the individual taxonomic profiles to BIOM
      * attach metadata to the merged taxonomic and functional profiles
      * stage the products into the public directory and tar the per-sample
        humann2 relative-abundance files

    Args:
        workflow: The workflow object; must provide parse_args() and go() and
            is handed to every helper that adds tasks.

    Returns:
        None
    """
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    contaminate_db = conf.get('databases').get('knead_dna')

    # Without MGX files in the manifest there is nothing to schedule.
    # NOTE(review): workflow.go() is treated as conditional on MGX files being
    # present; confirm against the original layout of this function.
    if not (data_files and data_files.get('MGX')):
        return

    mgx_files = data_files.get('MGX')
    input_files = mgx_files.get('input')
    pair_identifier = mgx_files.get('pair_identifier')
    file_extension = mgx_files.get('input_extension', '.fastq')
    sample_names = get_sample_names(input_files, file_extension)

    project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                        conf.get('processing_dir'),
                                        conf.get('public_dir')],
                                       project,
                                       creation_date,
                                       'WGS')
    (deposition_dir, processing_dir, public_dir) = project_dirs

    # The manifest is staged one level above the project deposition directory.
    base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))
    manifest_file = stage_files(workflow,
                                [args.manifest_file],
                                base_depo_dir)
    deposited_files = stage_files(workflow,
                                  input_files,
                                  deposition_dir,
                                  symlink=True)

    if file_extension == ".bam":
        ## Need to sort our BAM files to be sure here...
        paired_end_seqs = bam_to_fastq(workflow,
                                       deposited_files,
                                       processing_dir,
                                       paired_end=True,
                                       compress=False,
                                       threads=args.threads)
        # Converted files are identified by this pair identifier from here on.
        pair_identifier = "_R1"
    else:
        paired_end_seqs = input_files

    # Each tool may have its own thread count; fall back to the general one.
    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    (cleaned_fastqs, read_counts) = quality_control(workflow,
                                                    paired_end_seqs,
                                                    '.fastq',
                                                    processing_dir,
                                                    qc_threads,
                                                    contaminate_db,
                                                    pair_identifier=pair_identifier,
                                                    remove_intermediate_output=True)

    ## Generate taxonomic profile output. Output are stored in a list
    ## and are the following:
    ##
    ##   * Merged taxonomic profile
    ##   * Individual taxonomic files
    ##   * metaphlan2 SAM files
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    tax_profile_outputs = taxonomic_profile(workflow,
                                            cleaned_fastqs,
                                            processing_dir,
                                            tax_threads,
                                            '.fastq')

    ## Generate functional profile output using humann2. Outputs are the
    ## the following:
    ##
    ##   * Merged normalized genefamilies
    ##   * Merged normalized ecs
    ##   * Merged normalized pathways
    ##   * Merged genefamilies
    ##   * Merged ecs
    ##   * Merged pathways
    func_threads = args.threads_humann if args.threads_humann else args.threads
    func_profile_outputs = functional_profile(workflow,
                                              cleaned_fastqs,
                                              '.fastq',
                                              processing_dir,
                                              func_threads,
                                              tax_profile_outputs[1],
                                              remove_intermediate_output=True)

    ## The current biobakery workflows do not generate KO's from our genefamilies
    ## so we're going to want to do that ourselves.
    # NOTE(review): genefamilies is looked up under 'metaphlan2' while the
    # sibling humann2 products use 'humann2' -- confirm this is intended.
    genefamilies = name_files(sample_names,
                              os.path.join(processing_dir, 'metaphlan2'),
                              subfolder='main',
                              tag='genefamilies',
                              extension='tsv')
    pathways = name_files(sample_names,
                          os.path.join(processing_dir, 'humann2'),
                          subfolder='main',
                          tag='pathabundance',
                          extension='tsv')
    ecs = name_files(sample_names,
                     os.path.join(processing_dir, 'humann2'),
                     subfolder='regrouped',
                     tag='ecs',
                     extension='tsv')
    kos = name_files(sample_names,
                     os.path.join(processing_dir, 'humann2'),
                     subfolder='regrouped',
                     tag='kos',
                     extension='tsv')
    #(merged_norm_kos, merged_kos) = generate_ko_files(workflow,
    #                                                  genefamilies,
    #                                                  processing_dir)

    biom_files = batch_convert_tsv_to_biom(workflow, tax_profile_outputs[1])
    tax_biom_files = stage_files(workflow, biom_files, processing_dir)

    kneaddata_log_files = name_files(sample_names,
                                     os.path.join(processing_dir, 'kneaddata'),
                                     subfolder='main',
                                     extension='log')

    pub_raw_dir = os.path.join(public_dir, 'raw')
    pub_tax_profile_dir = os.path.join(public_dir, 'tax_profile')
    pub_func_profile_dir = os.path.join(public_dir, 'func_profile')
    # An explicit loop is required: on Python 3 a bare map() is lazy and the
    # folders would never be created.
    for pub_dir in (pub_raw_dir, pub_tax_profile_dir, pub_func_profile_dir):
        create_folders(pub_dir)

    knead_read_counts = os.path.join(processing_dir,
                                     'counts',
                                     'merged',
                                     'kneaddata_read_count_table.tsv')

    tax_profile_pcl = add_metadata_to_tsv(workflow,
                                          [tax_profile_outputs[0]],
                                          args.metadata_file,
                                          'metagenomics',
                                          id_col=conf.get('metadata_id_col'),
                                          col_replace=conf.get('analysis_col_patterns'),
                                          target_cols=conf.get('target_metadata_cols'),
                                          aux_files=[knead_read_counts])
    func_profile_pcl = add_metadata_to_tsv(workflow,
                                           [func_profile_outputs[0]],
                                           args.metadata_file,
                                           'metagenomics',
                                           id_col=conf.get('metadata_id_col'),
                                           col_replace=conf.get('analysis_col_patterns'),
                                           target_cols=conf.get('target_metadata_cols'),
                                           aux_files=[knead_read_counts])

    # Stage every product group into its public destination folder.
    staging_plan = [
        (cleaned_fastqs, pub_raw_dir),
        ([tax_profile_outputs[0]], pub_tax_profile_dir),
        (tax_profile_outputs[1], pub_tax_profile_dir),
        (tax_biom_files, pub_tax_profile_dir),
        (tax_profile_pcl, pub_tax_profile_dir),
        (func_profile_outputs, pub_func_profile_dir),
        (func_profile_pcl, pub_func_profile_dir),
        (kneaddata_log_files, pub_raw_dir),
    ]
    pub_files = [stage_files(workflow, files, target_dir)
                 for (files, target_dir) in staging_plan]

    relab_dir = os.path.join(processing_dir, 'humann2', 'relab')
    norm_genefamilies = name_files(sample_names,
                                   relab_dir,
                                   subfolder='genes',
                                   tag='genefamilies_relab',
                                   extension='tsv')
    norm_ecs_files = name_files(sample_names,
                                relab_dir,
                                subfolder='ecs',
                                tag='ecs_relab',
                                extension='tsv')
    norm_path_files = name_files(sample_names,
                                 relab_dir,
                                 subfolder='pathways',
                                 tag='pathabundance_relab',
                                 extension='tsv')
    norm_kos_files = name_files(sample_names,
                                relab_dir,
                                subfolder='kos_relab',
                                extension='tsv')

    # One tarball per sample holding its gene, EC and pathway relab tables.
    func_tar_files = []
    for (sample, gene_file, ecs_file, path_file) in zip(sample_names,
                                                        norm_genefamilies,
                                                        norm_ecs_files,
                                                        norm_path_files):
        tar_path = os.path.join(pub_func_profile_dir,
                                "%s_humann2.tgz" % sample)
        func_tar_file = tar_files(workflow,
                                  [gene_file, ecs_file, path_file],
                                  tar_path,
                                  depends=func_profile_outputs)
        func_tar_files.append(func_tar_file)

    workflow.go()