# import the workflow class from anadama2
from anadama2 import Workflow

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1", description="A workflow to run KneadData")

# add the custom arguments to the workflow
workflow.add_argument("kneaddata-db", desc="the kneaddata database", default="/work/code/kneaddata/db/")
workflow.add_argument("input-extension", desc="the input file extension", default="fastq")
workflow.add_argument("threads", desc="number of threads for knead_data to use", default=1)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
in_files = workflow.get_input_files(extension=args.input_extension)

# get a list of output files, one for each input file, with the kneaddata tag
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")
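# a sketch of the expected naming (assuming the default tagging scheme): an input
# file like sample1.fastq should map to <output_folder>/sample1_kneaddata.fastq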

# create a task for each set of input and output files to run kneaddata
workflow.add_task_group(
    "kneaddata --input [depends[0]] --output [output_folder] --reference-db [kneaddata_db] --threads [threads]",
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads)
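# in the command above, the bracketed names ([depends[0]], [output_folder],
# [kneaddata_db], [threads]) are filled in for each task from the depends list and
# the matching keyword arguments, giving one kneaddata task per input file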

workflow.go()
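# example invocation, as a sketch (the script name "kneaddata_workflow.py" is
# hypothetical; --input and --output are the options used by this workflow):
#   python kneaddata_workflow.py --input fastq_dir --output kneaddata_output --threads 4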
          "metadata":metadata,
          "metadata_labels":metadata_labels,
          "picard":args.input_picard,
          "picard_ext":args.input_picard_extension}


# listing all of the expected input files
input_desc += files.SixteenS.list_file_path_description("", input_files)


if not args.exclude_workflow_info:
    templates += [utilities.get_package_file("workflow_info")]


# add the document to the workflow
doc_task = workflow.add_document(
    templates=templates,
    depends=methoddepends,
    targets=workflow.name_output_files("16S_report." + args.format),
    vars=methodvars,
    table_of_contents=True)

# add an archive of the document and figures, removing the log file
# the archive will have the same name and location as the output folder
workflow.add_archive(
    depends=[args.output, doc_task],
    targets=args.output + ".zip",
    remove_log=True)
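# for example, running with --output my_16s_output should place the report in that
# folder and produce an archive named my_16s_output.zip alongside it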

# start the workflow
workflow.go()
    utilities.get_package_file("quality_control_paired_dna_rna"),
    utilities.get_package_file("taxonomy"),
    utilities.get_package_file("functional_dna_rna")
]

# add the template for the data processing information
log_file = None
if not args.exclude_workflow_info:
    templates += [utilities.get_package_file("workflow_info")]
    log_file = files.Workflow.path("log", args.input, error_if_not_found=True)
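# the log from the earlier data processing run is looked up in the input folder; with
# error_if_not_found set, a missing log should raise an error rather than be ignored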

# add the document to the workflow
doc_task = workflow.add_document(
    templates=templates,
    depends=[wmgx_qc_counts, wmtx_qc_counts, taxonomic_profile, pathabundance],
    targets=workflow.name_output_files("wmgx_wmtx_report." + args.format),
    vars={
        "title": "Metagenome and Metatranscriptome Report",
        "project": args.project_name,
        "introduction_text": args.introduction_text,
        "dna_read_counts": wmgx_qc_counts,
        "rna_read_counts": wmtx_qc_counts,
        "dna_aligned_read_counts": files.ShotGun.path("humann2_read_counts",
                                                      wmgx_input_folder,
                                                      none_if_not_found=True),
# add custom arguments for the database folder, input file list, and references
workflow.add_argument("dbfolder",
                      default=None,
                      desc="folder containing database")
workflow.add_argument("filesfile",
                      default=None,
                      desc="file with filepaths to run on (relative to input)")
workflow.add_argument("ref", default=None, desc="name of reference db")
workflow.add_argument(
    "refs",
    default=None,
    desc="file with list of references (relative to dbfolder)")

args = workflow.parse_args()

in_files = workflow.get_input_files(".fastq.gz")
out_files = workflow.name_output_files(name=in_files,
                                       tag="panphlan_map",
                                       extension="csv.bz2")

# optionally override the detected inputs with the file paths listed in filesfile
if args.filesfile:
    with open(args.filesfile) as file_handle:
        in_files = [line.strip() for line in file_handle]

# include the bowtie2 index location in the command only when a database folder is given
if args.dbfolder:
    cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads] --i_bowtie2_indexes [db]"
else:
    cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads]"

# select the references to map against: a single named reference or a file listing them
if args.ref:
    refs = [args.ref]
elif args.refs:
    with open(args.refs) as ref_file:
        refs = [line.strip() for line in ref_file if line.strip()]
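# a minimal sketch (not from the original script) of how cmd, refs, in_files and
# out_files might be turned into mapping tasks; it assumes a single selected
# reference, that args.threads exists, and that the [reference], [depend], [target],
# [threads] and [db] placeholders are filled from keyword arguments of the same names
for infile, outfile in zip(in_files, out_files):
    extra = {"db": args.dbfolder} if args.dbfolder else {}
    workflow.add_task(cmd,
                      depends=infile,
                      targets=outfile,
                      reference=refs[0],
                      depend=infile,
                      target=outfile,
                      threads=args.threads,
                      **extra)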
# modules needed to load the compressed MetaPhlAn2 marker database
import bz2
import pickle

# 0-based column indexes into a SAM alignment line
SAM_READ_NAME_INDEX = 0
SAM_REFERENCE_NAME_INDEX = 2
SAM_SEQ_INDEX = 9

def find_reads(task):
    # load the MetaPhlAn2 marker database and map each marker to its species,
    # keeping only the markers belonging to the species of interest
    db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

    marker_to_species = {}
    for marker, info in db['markers'].items():
        if info['clade'] in species_list:
            marker_to_species[marker] = info['clade']

    # read in the sam file and pull out the reads that align with the markers
    with open(task.targets[0].name, "w") as file_handle_write:
        with open(task.depends[0].name) as file_handle:
            for line in file_handle:
                if not line.startswith("@"):
                    data = line.rstrip().split("\t")
                    reference = data[SAM_REFERENCE_NAME_INDEX]
                    if reference in marker_to_species:
                        seq_id = ";".join([data[SAM_READ_NAME_INDEX], marker_to_species[reference]])
                        seq = data[SAM_SEQ_INDEX]
                        file_handle_write.write("\n".join([">" + seq_id, seq]) + "\n")

# for each of the input files write the fasta file of reads
for infile in workflow.get_input_files(extension=args.input_tag_extension):
    outfile = workflow.name_output_files(infile).replace(
        args.input_tag_extension, "_metaphlan2_marker_aligned_subset.fasta")
    workflow.add_task(
        find_reads,
        depends=infile,
        targets=outfile)
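# here add_task is given a Python function instead of a command string; AnADAMA2
# calls find_reads with the task object, which is how the function reaches
# task.depends[0].name and task.targets[0].name above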

workflow.go()