from anadama2 import Workflow

# Workflow that fetches three HMP 16S sample files from the DACC FTP site.
# The input/output options are removed since this workflow takes neither.
workflow = Workflow(remove_options=["input", "output"])

downloads = [
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011275.fsa.gz",
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011273.fsa.gz",
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011180.fsa.gz"
]

for url in downloads:
    # Each task downloads one file; the target is the remote file's basename,
    # so AnADAMA2 can skip the task when the file already exists.
    workflow.add_task(
        "wget -O [targets[0]] [args[0]]",
        targets=url.split("/")[-1],
        args=url)

workflow.go()
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

# add a task to download the compressed OTU count table
workflow.add_task(
    "wget ftp://public-ftp.hmpdacc.org/HMMCP/finalData/hmp1.v35.hq.otu.counts.bz2 -O [targets[0]]",
    targets="hmp1.v35.hq.otu.counts.bz2")

# add a task to decompress the file
workflow.add_task(
    "bzip2 -d < [depends[0]] > [targets[0]]",
    depends="hmp1.v35.hq.otu.counts.bz2",
    targets="hmp1.v35.hq.otu.counts")


def remove_end_tabs_function(task):
    """Strip trailing whitespace (the end tabs) from every line of the task's
    dependency file, writing the cleaned lines to the task's target file.

    task: AnADAMA2 task object; reads task.depends[0].name and writes
          task.targets[0].name.
    """
    # FIX: the original iterated over a bare open(...) and never closed the
    # input file handle; both files are now managed by context managers.
    with open(task.targets[0].name, 'w') as file_handle_out, \
            open(task.depends[0].name) as file_handle_in:
        for line in file_handle_in:
            file_handle_out.write(line.rstrip() + "\n")


# add a task with a function to remove the end tabs from the file
workflow.add_task(
    remove_end_tabs_function,
    depends="hmp1.v35.hq.otu.counts",
    targets="hmp1.v35.hq.otu.counts.notabs",
    name="remove_end_tabs")

workflow.go()
# Parsing the workflow arguments args = workflow.parse_args() #Loading the config setting args.config = 'etc/config.ini' # AnADAMA2 example workflow.do workflow.do("ls /usr/bin/ | sort > [t:output/global_exe.txt]") #Command workflow.do("ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]") #Command # Task0 sample python analysis module - src/trim.py workflow.add_task( "src/trim.py --lines [args[0]] --output [targets[0]] --input " + args.input, #Command depends=[TrackedExecutable("src/trim.py") ], #Tracking executable dependencies targets=args.output, #Output target directory args=[args.lines]) #Additional arguments # Task1 sample python visualization module - src/plot.py workflow.add_task( "src/plot.py --output [targets[0]] --input " + args.input, #Command depends=[TrackedExecutable("src/plot.py") ], #Tracking executable dependencies targets=args.output) #Output target directory # Task2 sample R module - src/analysis_example.r workflow.add_task( "src/analysis.R -o [targets[0]] -d " + args.metadata, #Command depends=[TrackedExecutable("src/analysis.R")
# NOTE(review): fragment — this chunk starts inside a list comprehension
# building per-sample kneaddata output names, within the paired-end branch of
# an if/else whose "if" line is above this chunk; the final add_task also
# continues past the end. Indentation reconstructed from the flattened source.
name + ".trimmed.1.fastq",
        name + ".trimmed.2.fastq",
        name + ".trimmed.single.1.fastq",
        name + ".trimmed.single.2.fastq",
        name + ".trimmed.single.12.fastq"],
        args.output,
        subfolder="kneaddata",
        create_folder=True)
        for name in sample_names]
    paired = True
    # run kneaddata on each read pair, then concatenate the two single-end
    # outputs (args[3], args[4]) into the combined file (targets[2])
    for target_set, input_R1, input_R2, name in zip(
            qc_targets, input_pair1, input_pair2, sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]",
            depends=[input_R1, input_R2, TrackedExecutable("kneaddata")],
            targets=[target_set[0], target_set[1], target_set[4]],
            args=[os.path.dirname(target_set[0]),
                  args.threads,
                  name,
                  target_set[2],
                  target_set[3]])
else:
    # single-end branch: one trimmed fastq per sample
    qc_targets = utilities.name_files(sample_names,
                                      args.output,
                                      subfolder="kneaddata",
                                      create_folder=True,
                                      extension="trimmed.fastq")
    # (statement continues past this chunk)
    for target_file, input_file, name in zip(qc_targets, input_files, sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]]",
            depends=[input_file, TrackedExecutable("kneaddata")],
import anadama2.tracked
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

# Container tracks its attributes as dependencies, so task1 reruns
# whenever the value of "a" changes between workflow runs.
container = anadama2.tracked.Container(a=20)

# task1: write the tracked value to a file
task1 = workflow.add_task(
    "echo [depends[0]] > [targets[0]]",
    depends=container.a,
    targets="echo.txt",
    name="task1")

# task2: consume task1's output, chaining the two tasks by target/dependency
task2 = workflow.add_task(
    "p=$(cat [depends[0]]); echo $p > [targets[0]]",
    depends=task1.targets[0],
    targets="echo2.txt",
    name="task2")

workflow.go()
# NOTE(review): fragment — the first line closes a workflow.add_argument(...)
# call opened above this chunk; "utilities" and "verify_checksum" are defined
# elsewhere in this file.
required=True)

args = workflow.parse_args()

# get all of the input files (exit with an error if none are found)
input_files = utilities.find_files(args.input,
                                   extension=args.input_extension,
                                   exit_if_not_found=True)
sample_names = utilities.sample_names(input_files, args.input_extension)

# for each raw input file, generate an md5sum file
md5sum_outputs = [os.path.join(args.output, output_file_name) + ".md5sum"
                  for output_file_name in sample_names]
workflow.add_task_group("md5sum [depends[0]] > [targets[0]]",
                        depends=input_files,
                        targets=md5sum_outputs)

# for each file, verify the checksum with the verify_checksum task function
md5sum_checks = [os.path.join(args.output, check_file_name) + ".check"
                 for check_file_name in sample_names]
for in_file, sum_file, check_file in zip(input_files, md5sum_outputs, md5sum_checks):
    workflow.add_task(verify_checksum,
                      depends=[in_file, sum_file, args.input_metadata],
                      targets=[check_file])

workflow.go()
# NOTE(review): fragment — these are the trailing arguments of a
# workflow.add_argument(...) call opened above this chunk.
"input-extensions",
    desc="the comma-delimited list of extensions of the input files",
    default="txt,tsv,fastq,fastq.gz,log,sam")

args = workflow.parse_args()


# get all of the files in the input folder with the extensions provided
def get_files_to_add(input_folder):
    """Return all files under input_folder whose names end with one of the
    extensions in args.input_extensions (module-global args), skipping any
    path containing AnADAMA's ".anadama" bookkeeping folder."""
    posible_extensions = set(args.input_extensions.split(","))
    input_files = []
    for folder, directories, files in os.walk(input_folder):
        # skip AnADAMA internal state directories
        if not ".anadama" in folder:
            for filename in files:
                if any(map(lambda ext: filename.endswith(ext), posible_extensions)):
                    input_files.append(os.path.join(folder, filename))
    return input_files


# get the files to add from the input and output folder
input_files = get_files_to_add(args.input)
output_files = get_files_to_add(args.output)

for filename in input_files + output_files:
    # each file is both dependency and target, which registers it with the
    # workflow's tracking database
    workflow.add_task("echo 'Adding file [depends[0]]'",
                      depends=filename,
                      targets=filename)

workflow.go()
# NOTE(review): fragment — the statements down through the write loop are the
# body of a task function (added to the workflow below as "find_reads"); its
# "def" line, the SAM_* column-index constants, "species_list", and the
# pickle/bz2 imports are above this chunk. Original leading indentation could
# not be recovered from the flattened source.
db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

# map each marker to its species clade, restricted to species_list
marker_to_species = {}
for marker, info in db['markers'].items():
    if info['clade'] in species_list:
        marker_to_species[marker] = info['clade']

# read in the sam file and pull out the reads that align with the markers
with open(task.targets[0].name, "w") as file_handle_write:
    with open(task.depends[0].name) as file_handle:
        for line in file_handle:
            # skip SAM header lines (start with "@")
            if not line.startswith("@"):
                data = line.rstrip().split("\t")
                reference = data[SAM_REFERENCE_NAME_INDEX]
                if reference in marker_to_species.keys():
                    # fasta id is "<read name>;<species>"
                    seq_id = ";".join([data[SAM_READ_NAME_INDEX], marker_to_species[reference]])
                    seq = data[SAM_SEQ_INDEX]
                    file_handle_write.write("\n".join([">" + seq_id, seq]) + "\n")

# for each of the input files write the fasta file of reads
for infile in workflow.get_input_files(extension=args.input_tag_extension):
    outfile = workflow.name_output_files(infile).replace(args.input_tag_extension, "_metaphlan2_marker_aligned_subset.fasta")
    workflow.add_task(
        find_reads,
        depends=infile,
        targets=outfile)

workflow.go()
# NOTE(review): fragment — the first line closes a workflow.add_argument(...)
# call opened above this chunk, and the final add_task statement continues
# past the end of this chunk. "utilities", "os", and "datetime" are imported
# elsewhere in this file.
required=True)
workflow.add_argument("count-script",
                      desc="the script to update the data stats",
                      required=True)
args = workflow.parse_args()

# archive the raw input files into a dated per-study folder
date = datetime.datetime.now()
study_folder = args.study + "_" + str(date.month) + "_" + str(date.day) + "_" + str(date.year)
archive_folder = os.path.join(args.output, study_folder)
upload_archive = archive_folder + "_uploaded"
utilities.create_folders(upload_archive)
# mv --backup keeps a backup of any file that would otherwise be overwritten
task1 = workflow.add_task("mv --backup [args[0]]/* [args[1]]/",
                          args=[args.input_upload, upload_archive])

# archive the processed files, ordered after the upload archive via depends
process_archive = archive_folder + "_processed"
utilities.create_folders(process_archive)
task2 = workflow.add_task("mv --backup [args[0]]/* [args[1]]/",
                          depends=task1,
                          args=[args.input_processed, process_archive])

# copy the study metadata into the processed archive
task3 = workflow.add_task("cp [args[0]]/metadata/metadata*.tsv [args[1]]/",
                          depends=task2,
                          args=[upload_archive, process_archive])

# move the MANIFEST (rsync then remove the original);
# statement continues past this chunk
task4 = workflow.add_task(
    "rsync --ignore-existing [args[0]]/metadata/MANIFEST [args[1]] && rm [args[0]]/metadata/MANIFEST",