# NOTE(review): this script had its newlines stripped, which made every inline
# comment swallow the rest of the file. Reconstructed statement/comment
# boundaries below; no tokens or runtime strings were changed.

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1", description="A workflow to run KneadData")

# add the custom arguments to the workflow
# (anadama2 exposes "kneaddata-db" to the script as args.kneaddata_db)
workflow.add_argument("kneaddata-db", desc="the kneaddata database",
                      default="/work/code/kneaddata/db/")
workflow.add_argument("input-extension", desc="the input file extension",
                      default="fastq")
workflow.add_argument("threads", desc="number of threads for knead_data to use",
                      default=1)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
in_files = workflow.get_input_files(extension=args.input_extension)

# get a list of output files, one for each input file, with the kneaddata tag
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")

# create a task for each set of input and output files to run kneaddata;
# the bracketed placeholders are filled in by anadama2 from the keyword
# arguments below (one task per depends/targets pair)
workflow.add_task_group(
    "kneaddata --input [depends[0]] --output [output_folder] --reference-db [kneaddata_db] --threads [threads]",
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads)

# start the workflow
workflow.go()
"metadata":metadata, "metadata_labels":metadata_labels, "picard":args.input_picard, "picard_ext":args.input_picard_extension} # listing all expected input files input_desc+=files.SixteenS.list_file_path_description("",input_files) if not args.exclude_workflow_info: templates += [utilities.get_package_file("workflow_info")] # add the document to the workflow doc_task=workflow.add_document( templates=templates, depends=methoddepends, targets=workflow.name_output_files("16S_report."+args.format), vars=methodvars, table_of_contents=True) # add an archive of the document and figures, removing the log file # the archive will have the same name and location as the output folder workflow.add_archive( depends=[args.output,doc_task], targets=args.output+".zip", remove_log=True) # start the workflow workflow.go()
utilities.get_package_file("quality_control_paired_dna_rna"), utilities.get_package_file("taxonomy"), utilities.get_package_file("functional_dna_rna") ] # add the template for the data processing information log_file = None if not args.exclude_workflow_info: templates += [utilities.get_package_file("workflow_info")] log_file = files.Workflow.path("log", args.input, error_if_not_found=True) # add the document to the workflow doc_task = workflow.add_document( templates=templates, depends=[wmgx_qc_counts, wmtx_qc_counts, taxonomic_profile, pathabundance], targets=workflow.name_output_files("wmgx_wmtx_report." + args.format), vars={ "title": "Metagenome and Metatranscriptome Report", "project": args.project_name, "introduction_text": args.introduction_text, "dna_read_counts": wmgx_qc_counts, "rna_read_counts": wmtx_qc_counts, "dna_aligned_read_counts": files.ShotGun.path("humann2_read_counts", wmgx_input_folder, none_if_not_found=True),
default=None, desc="folder containing database") workflow.add_argument("filesfile", default=None, desc="file with filepaths to run on (relative to input)") workflow.add_argument("ref", default=None, desc="name of reference db") workflow.add_argument( "refs", default=None, desc="file with list of references (relative to dbfolder)") args = workflow.parse_args() in_files = workflow.get_input_files(".fastq.gz") out_files = workflow.name_output_files(name=in_files, tag="panphlan_map", extension="csv.bz2") if args.filesfile: with open(args.filesfile) as f: in_files = [l.strip() for l in f] if args.dbfolder: cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads] --i_bowtie2_indexes [db]" else: cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads]" if args.ref: refs = [args.ref] elif args.refs: r = open(args.refs, "r")
db = pickle.load(bz2.BZ2File(args.pkl_database, 'r')) marker_to_species={} for marker,info in db['markers'].items(): if info['clade'] in species_list: marker_to_species[marker]=info['clade'] # read in the sam file and pull out the reads that align with the markers with open(task.targets[0].name, "w") as file_handle_write: with open(task.depends[0].name) as file_handle: for line in file_handle: if not line.startswith("@"): data=line.rstrip().split("\t") reference=data[SAM_REFERENCE_NAME_INDEX] if reference in marker_to_species.keys(): seq_id = ";".join([data[SAM_READ_NAME_INDEX],marker_to_species[reference]]) seq = data[SAM_SEQ_INDEX] file_handle_write.write("\n".join([">"+seq_id,seq])+"\n") # for each of the input files write the fasta file of reads for infile in workflow.get_input_files(extension=args.input_tag_extension): outfile = workflow.name_output_files(infile).replace(args.input_tag_extension,"_metaphlan2_marker_aligned_subset.fasta") workflow.add_task( find_reads, depends=infile, targets=outfile) workflow.go()