# Beispiel #1
# 0
                      desc="number of threads for panphlan to use")
# Register the panphlan-specific command-line options on the workflow.
workflow.add_argument(
    "dbfolder", default=None, desc="folder containing database")
workflow.add_argument(
    "filesfile", default=None,
    desc="file with filepaths to run on (relative to input)")
workflow.add_argument(
    "ref", default=None, desc="name of reference db")
workflow.add_argument(
    "refs", default=None,
    desc="file with list of references (relative to dbfolder)")

# Parse the command line, then pair every gzipped FASTQ input with a
# bz2-compressed CSV output name tagged for the panphlan mapping step.
args = workflow.parse_args()

in_files = workflow.get_input_files(".fastq.gz")
out_files = workflow.name_output_files(
    name=in_files, tag="panphlan_map", extension="csv.bz2")

# An explicit file listing, when given, replaces the discovered inputs;
# one path per line, relative to the input folder.
if args.filesfile:
    with open(args.filesfile) as listing:
        in_files = [entry.strip() for entry in listing]

# Choose the panphlan_map command template: the bowtie2-index option is
# only included when a database folder was supplied.
cmd = (
    "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads] --i_bowtie2_indexes [db]"
    if args.dbfolder
    else "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads]"
)

# A single --ref collapses the reference list to just that name.
if args.ref:
    refs = [args.ref]
# Build a workflow that runs KneadData over every input file.
# The version string is reported by "--version" and the description by
# "--help" on the generated command line.
workflow = Workflow(version="0.1", description="A workflow to run KneadData")

# Workflow-specific command-line options (dashes become underscores on
# the parsed args object, e.g. args.kneaddata_db).
workflow.add_argument("kneaddata-db", default="/work/code/kneaddata/db/",
                      desc="the kneaddata database")
workflow.add_argument("input-extension", default="fastq",
                      desc="the input file extension")
workflow.add_argument("threads", default=1,
                      desc="number of threads for knead_data to use")

# Parse the command-line arguments.
args = workflow.parse_args()

# Every file in the input folder carrying the requested extension.
in_files = workflow.get_input_files(extension=args.input_extension)

# One "kneaddata"-tagged output name per input file.
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")

# Queue one kneaddata invocation per input/output pair; the bracketed
# placeholders are filled in from the keyword arguments below.
workflow.add_task_group(
    "kneaddata --input [depends[0]] --output [output_folder] --reference-db [kneaddata_db] --threads [threads]",
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads)

# Execute all queued tasks.
workflow.go()
    # Load the marker database from a bz2-compressed pickle file.
    # NOTE(review): pickle.load on an arbitrary file is unsafe for
    # untrusted input — assumes args.pkl_database is a trusted path.
    db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

    # Map each marker name to its clade, keeping only markers whose
    # clade appears in species_list (defined elsewhere in this file).
    marker_to_species={}
    for marker,info in db['markers'].items():
        if info['clade'] in species_list:
            marker_to_species[marker]=info['clade']

    # read in the sam file and pull out the reads that align with the markers
    # For every alignment whose reference is a selected marker, write a
    # FASTA record: ">read_name;species" followed by the read sequence.
    with open(task.targets[0].name, "w") as file_handle_write:
        with open(task.depends[0].name) as file_handle:
            for line in file_handle:
                # Skip SAM header lines, which start with "@".
                if not line.startswith("@"):
                    data=line.rstrip().split("\t")
                    reference=data[SAM_REFERENCE_NAME_INDEX]
                    if reference in marker_to_species.keys():
                        seq_id = ";".join([data[SAM_READ_NAME_INDEX],marker_to_species[reference]])
                        seq = data[SAM_SEQ_INDEX]
                        file_handle_write.write("\n".join([">"+seq_id,seq])+"\n")

# Queue one find_reads task per tagged input file; each task writes a
# FASTA file of the reads that aligned to the selected markers.
for infile in workflow.get_input_files(extension=args.input_tag_extension):
    outfile = workflow.name_output_files(infile).replace(
        args.input_tag_extension,
        "_metaphlan2_marker_aligned_subset.fasta")
    workflow.add_task(find_reads, depends=infile, targets=outfile)

# Execute all queued tasks.
workflow.go()