Example #1
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

downloads = [
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011275.fsa.gz",
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011273.fsa.gz",
    "ftp://public-ftp.hmpdacc.org/HM16STR/by_sample/SRS011180.fsa.gz"
]

for link in downloads:
    workflow.add_task("wget -O [targets[0]] [args[0]]",
                      targets=link.split("/")[-1],
                      args=link)

workflow.go()
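The same download step can also be written as a Python function task; AnADAMA2 calls the function with the task object, as Example #2 below shows. A minimal sketch, where the helper name download_file is ours and urllib stands in for wget:

import urllib.request
from functools import partial

def download_file(link, task):
    # write the download to the task's first target file
    urllib.request.urlretrieve(link, task.targets[0].name)

for link in downloads:
    workflow.add_task(partial(download_file, link),
                      targets=link.split("/")[-1])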
Example #2
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

# add a task to download the file
workflow.add_task(
    "wget ftp://public-ftp.hmpdacc.org/HMMCP/finalData/hmp1.v35.hq.otu.counts.bz2 -O [targets[0]]",
    targets="hmp1.v35.hq.otu.counts.bz2")

# add a task to decompress the file
workflow.add_task("bzip2 -d < [depends[0]] > [targets[0]]",
                  depends="hmp1.v35.hq.otu.counts.bz2",
                  targets="hmp1.v35.hq.otu.counts")


def remove_end_tabs_function(task):
    with open(task.targets[0].name, 'w') as file_handle_out:
        for line in open(task.depends[0].name):
            file_handle_out.write(line.rstrip() + "\n")


# add a task with a function to remove the end tabs from the file
workflow.add_task(remove_end_tabs_function,
                  depends="hmp1.v35.hq.otu.counts",
                  targets="hmp1.v35.hq.otu.counts.notabs",
                  name="remove_end_tabs")

workflow.go()
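For comparison, the same end-tab cleanup can be expressed as a shell task instead of a Python function; a minimal sketch using sed (the pattern strips trailing spaces and tabs from each line):

workflow.add_task("sed 's/[ \t]*$//' [depends[0]] > [targets[0]]",
                  depends="hmp1.v35.hq.otu.counts",
                  targets="hmp1.v35.hq.otu.counts.notabs")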
Example #3
from anadama2 import Workflow
from anadama2.tracked import TrackedExecutable

# assumed preamble: the start of this example is truncated, so the
# imports, workflow creation, and custom arguments are reconstructed
workflow = Workflow()
workflow.add_argument("lines", desc="the number of lines to trim", default=10)
workflow.add_argument("metadata", desc="the metadata file for the analysis")

# parse the workflow arguments
args = workflow.parse_args()

# load the config setting
args.config = 'etc/config.ini'

# AnADAMA2 example workflow.do
workflow.do("ls /usr/bin/ | sort > [t:output/global_exe.txt]")
workflow.do("ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]")

# Task0 sample python analysis module - src/trim.py
workflow.add_task(
    "src/trim.py --lines [args[0]] --output [targets[0]] --input " +
    args.input,  # command
    depends=[TrackedExecutable("src/trim.py")],  # track the script as a dependency
    targets=args.output,  # output target
    args=[args.lines])  # additional arguments

# Task1 sample python visualization module - src/plot.py
workflow.add_task(
    "src/plot.py --output [targets[0]] --input " + args.input,  # command
    depends=[TrackedExecutable("src/plot.py")],  # track the script as a dependency
    targets=args.output)  # output target

# Task2 sample R module - src/analysis.R
workflow.add_task(
    "src/analysis.R -o [targets[0]] -d " + args.metadata,  # command
    depends=[TrackedExecutable("src/analysis.R")],  # track the script as a dependency
    targets=args.output)  # output target

workflow.go()
Example #4
import os

from anadama2 import Workflow
from anadama2.tracked import TrackedExecutable
from biobakery_workflows import utilities

# assumed preamble: the start of this example is truncated, so the
# imports, workflow creation, arguments, and paired-end input detection
# are reconstructed here
workflow = Workflow()
workflow.add_argument("input-extension", desc="the extension of the input files", default="fastq")
workflow.add_argument("threads", desc="the number of threads for kneaddata", default=1)
args = workflow.parse_args()

input_files = utilities.find_files(args.input, extension=args.input_extension)
sample_names = utilities.sample_names(input_files, args.input_extension)
input_pair1, input_pair2 = utilities.paired_files(input_files, args.input_extension)

# run quality control with kneaddata, paired or single end
paired = False
if input_pair1 and input_pair2:
    qc_targets = [
        utilities.name_files([
            name + ".trimmed.1.fastq", name + ".trimmed.2.fastq",
            name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq",
            name + ".trimmed.single.12.fastq"
        ],
                             args.output,
                             subfolder="kneaddata",
                             create_folder=True) for name in sample_names
    ]
    paired = True
    for target_set, input_R1, input_R2, name in zip(qc_targets, input_pair1,
                                                    input_pair2, sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]",
            depends=[input_R1, input_R2,
                     TrackedExecutable("kneaddata")],
            targets=[target_set[0], target_set[1], target_set[4]],
            args=[
                os.path.dirname(target_set[0]), args.threads, name,
                target_set[2], target_set[3]
            ])
else:
    qc_targets = utilities.name_files(sample_names,
                                      args.output,
                                      subfolder="kneaddata",
                                      create_folder=True,
                                      extension="trimmed.fastq")
    for target_file, input_file, name in zip(qc_targets, input_files,
                                             sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]]",
            depends=[input_file, TrackedExecutable("kneaddata")],
            targets=target_file,
            args=[os.path.dirname(target_file), args.threads, name])

workflow.go()
Example #5
import anadama2.tracked
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

# create a container class to track
container = anadama2.tracked.Container(a=20)

# add a task that depends on the "a" variable in the container
task1 = workflow.add_task(
    "echo [depends[0]] > [targets[0]]",
    depends=container.a,
    targets="echo.txt",
    name="task1")

# add a task that depends on the targets of task1
task2 = workflow.add_task(
    "p=$(cat [depends[0]]); echo $p > [targets[0]]",
    depends=task1.targets[0],
    targets="echo2.txt",
    name="task2")

workflow.go()
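Rerunning the script with a different value for the tracked variable marks task1 out of date, and task2 reruns as well because it depends on task1's target. A minimal sketch of a later run of the same workflow:

import anadama2.tracked
from anadama2 import Workflow

workflow = Workflow(remove_options=["input", "output"])

# the tracked value changed from 20 to 30, so task1 reruns, and the
# rewritten echo.txt causes task2 to rerun too
container = anadama2.tracked.Container(a=30)

task1 = workflow.add_task(
    "echo [depends[0]] > [targets[0]]",
    depends=container.a,
    targets="echo.txt",
    name="task1")

task2 = workflow.add_task(
    "p=$(cat [depends[0]]); echo $p > [targets[0]]",
    depends=task1.targets[0],
    targets="echo2.txt",
    name="task2")

workflow.go()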

Example #6
import os

from anadama2 import Workflow
from biobakery_workflows import utilities

# assumed preamble: the start of this example is truncated, so the
# imports, workflow creation, and custom arguments are reconstructed
workflow = Workflow()
workflow.add_argument("input-extension",
                      desc="the extension of the input files",
                      required=True)
workflow.add_argument("input-metadata",
                      desc="the metadata file for the input files",
                      required=True)
args = workflow.parse_args()

# get all of the input files
input_files = utilities.find_files(args.input,
                                   extension=args.input_extension,
                                   exit_if_not_found=True)
sample_names = utilities.sample_names(input_files, args.input_extension)

# for each raw input file, generate an md5sum file
md5sum_outputs = [
    os.path.join(args.output, output_file_name) + ".md5sum"
    for output_file_name in sample_names
]
workflow.add_task_group("md5sum [depends[0]] > [targets[0]]",
                        depends=input_files,
                        targets=md5sum_outputs)

# for each file, verify the checksum
md5sum_checks = [
    os.path.join(args.output, check_file_name) + ".check"
    for check_file_name in sample_names
]
for in_file, sum_file, check_file in zip(input_files, md5sum_outputs,
                                         md5sum_checks):
    workflow.add_task(verify_checksum,
                      depends=[in_file, sum_file, args.input_metadata],
                      targets=[check_file])

workflow.go()
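The verify_checksum function is defined earlier in the original script and is truncated here; a hypothetical sketch of what such a function task could look like (the comparison against args.input_metadata is omitted):

import hashlib

def verify_checksum(task):
    # recompute the md5 of the raw input file
    with open(task.depends[0].name, "rb") as file_handle:
        new_sum = hashlib.md5(file_handle.read()).hexdigest()
    # read the sum recorded by the md5sum task
    recorded_sum = open(task.depends[1].name).readline().split()[0]
    # write the result to the check target
    with open(task.targets[0].name, "w") as file_handle_out:
        file_handle_out.write("match\n" if new_sum == recorded_sum else "MISMATCH\n")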
Example #7
    "input-extensions",
    desc="the comma-delimited list of extensions of the input files",
    default="txt,tsv,fastq,fastq.gz,log,sam")
args = workflow.parse_args()


# get all of the files in the input folder with the extensions provided
def get_files_to_add(input_folder):
    possible_extensions = set(args.input_extensions.split(","))
    input_files = []
    for folder, directories, files in os.walk(input_folder):
        # skip the anadama tracking folder
        if ".anadama" not in folder:
            for filename in files:
                if any(filename.endswith(ext) for ext in possible_extensions):
                    input_files.append(os.path.join(folder, filename))
    return input_files


# get the files to add from the input and output folder
input_files = get_files_to_add(args.input)
output_files = get_files_to_add(args.output)

for filename in input_files + output_files:
    workflow.add_task("echo 'Adding file [depends[0]]'",
                      depends=filename,
                      targets=filename)

workflow.go()
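Listing each file as both a dependency and a target of a trivial task registers files that already exist in AnADAMA2's tracking database. Workflow.already_exists is a built-in alternative for the same purpose; a minimal sketch, assuming the helper is available in your anadama2 version:

for filename in input_files + output_files:
    workflow.already_exists(filename)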
Example #8
import bz2
import pickle

from anadama2 import Workflow

# standard SAM column indexes (0-based): read name, reference name, sequence
SAM_READ_NAME_INDEX = 0
SAM_REFERENCE_NAME_INDEX = 2
SAM_SEQ_INDEX = 9

# assumed preamble: the start of this example is truncated, so the
# imports, constants, workflow creation, and arguments are reconstructed
workflow = Workflow()
workflow.add_argument("pkl-database", desc="the metaphlan2 pkl database", required=True)
workflow.add_argument("species-list", desc="the comma-delimited list of species of interest", required=True)
workflow.add_argument("input-tag-extension", desc="the extension of the input sam files", default=".sam")
args = workflow.parse_args()

# the species to search for in the alignments
species_list = args.species_list.split(",")


# write a fasta file of the reads that align to the species markers
def find_reads(task):
    # load the metaphlan2 marker database
    db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

    # map each marker to its species, for the species of interest
    marker_to_species = {}
    for marker, info in db['markers'].items():
        if info['clade'] in species_list:
            marker_to_species[marker] = info['clade']

    # read in the sam file and pull out the reads that align with the markers
    with open(task.targets[0].name, "w") as file_handle_write:
        with open(task.depends[0].name) as file_handle:
            for line in file_handle:
                if not line.startswith("@"):
                    data = line.rstrip().split("\t")
                    reference = data[SAM_REFERENCE_NAME_INDEX]
                    if reference in marker_to_species:
                        seq_id = ";".join([data[SAM_READ_NAME_INDEX], marker_to_species[reference]])
                        seq = data[SAM_SEQ_INDEX]
                        file_handle_write.write("\n".join([">" + seq_id, seq]) + "\n")

# for each of the input files write the fasta file of reads
for infile in workflow.get_input_files(extension=args.input_tag_extension):
    outfile = workflow.name_output_files(infile).replace(
        args.input_tag_extension, "_metaphlan2_marker_aligned_subset.fasta")
    workflow.add_task(
        find_reads,
        depends=infile,
        targets=outfile)

workflow.go()
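Because find_reads loads the pickled database inside the task body, the load repeats for every input file. When the tasks run locally in the same process, the load and the marker map can be hoisted so they are built once; a sketch, using the names from the example above:

# build the marker-to-species map once, before the tasks are added
db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))
marker_to_species = {marker: info['clade']
                     for marker, info in db['markers'].items()
                     if info['clade'] in species_list}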

Example #9
import datetime
import os

from anadama2 import Workflow
from biobakery_workflows import utilities

# assumed preamble: the start of this example is truncated, so the
# imports, workflow creation, and custom arguments are reconstructed
workflow = Workflow()
workflow.add_argument("study",
                      desc="the name of the study",
                      required=True)
workflow.add_argument("input-upload",
                      desc="the folder of uploaded raw files",
                      required=True)
workflow.add_argument("input-processed",
                      desc="the folder of processed files",
                      required=True)
workflow.add_argument("count-script",
                      desc="the script to update the data stats",
                      required=True)
args = workflow.parse_args()

# archive the raw input files
date = datetime.datetime.now()
study_folder = args.study + "_" + str(date.month) + "_" + str(
    date.day) + "_" + str(date.year)
archive_folder = os.path.join(args.output, study_folder)

upload_archive = archive_folder + "_uploaded"
utilities.create_folders(upload_archive)

task1 = workflow.add_task("mv --backup [args[0]]/* [args[1]]/",
                          args=[args.input_upload, upload_archive])

# archive the processed files
process_archive = archive_folder + "_processed"
utilities.create_folders(process_archive)

task2 = workflow.add_task("mv --backup [args[0]]/* [args[1]]/",
                          depends=task1,
                          args=[args.input_processed, process_archive])

task3 = workflow.add_task("cp [args[0]]/metadata/metadata*.tsv [args[1]]/",
                          depends=task2,
                          args=[upload_archive, process_archive])

task4 = workflow.add_task(
    "rsync --ignore-existing [args[0]]/metadata/MANIFEST [args[1]] && rm [args[0]]/metadata/MANIFEST",
    depends=task3,
    # assumed completion: mirror task3 and move the MANIFEST from the
    # upload archive to the processed archive
    args=[upload_archive, process_archive])

workflow.go()