def sort_by_size(workflow, method, fasta_file, output_folder, min_size):
    """ Sort reads by size, removing those that are not of min size
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis - usearch (default) or vsearch
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        output_folder (string): The path of the output folder.
        min_size (int): Min size of the reads to filter.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the fasta file sorted by size
    """

    # get the name of the output files
    output_file = utilities.name_files("all_samples_sorted.fasta",
                                       output_folder)
    if method == "vsearch":
        workflow.add_task(
            "vsearch --sortbysize [depends[0]] --output [targets[0]] --minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=min_size,
            name="vsearch_sortbysize")
    else:
        workflow.add_task(
            "usearch -sortbysize [depends[0]] -fastaout [targets[0]] -minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=output_file,
            args=min_size,
            name="usearch_sortbysize")

    return output_file
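
# Illustrative usage sketch (not part of the original source): wires sort_by_size
# into a minimal anadama2 workflow. The input fasta path, output folder, and
# min_size value below are placeholder assumptions.
def _example_sort_by_size():
    from anadama2 import Workflow

    workflow = Workflow()
    sorted_fasta = sort_by_size(workflow, "vsearch",
                                "all_samples_dereplicated.fasta",
                                "output", min_size=2)
    workflow.go()
    return sorted_fasta
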
def filter_fastq(workflow,
                 method,
                 fastq_file,
                 output_folder,
                 threads,
                 maxee,
                 trunc_len,
                 fastq_ascii,
                 qmax=45):
    """ Filter the fastq files using the maxee value
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis - usearch (default) or vsearch
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        maxee (int): The maxee value to use for filtering.
        trunc_len (int): The length at which to truncate the reads.
        fastq_ascii (int): The ASCII offset of the fastq quality scores (usually 33 or 64).
        qmax (int): Max qvalue increased from the default of 43 to allow for Ion Torrent data
    Requires:
        usearch or vsearch
        
    Returns:
        string: A path to the filtered fasta file
        string: A path to the full fasta file
        
    """

    # get the name of the final merged fastq file
    fasta_filtered_file = utilities.name_files(
        "all_samples_concatenated_filtered.fasta", output_folder)
    fasta_discarded_file = utilities.name_files(
        "all_samples_concatenated_discarded.fasta", output_folder)
    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]]",
            depends=[fastq_file, TrackedExecutable("vsearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len],
            name="vsearch_fastq_filter")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]] -fastq_qmax [args[3]] -fastq_ascii [args[4]]",
            depends=[fastq_file,TrackedExecutable("usearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len, qmax, fastq_ascii],
            name="usearch_fastq_filter")

    # create a fasta file of all reads (including the discarded reads)
    fasta_file = utilities.name_files("all_samples_concatenated.fasta",
                                      output_folder)
    workflow.add_task("cat [depends[0]] [depends[1]] > [targets[0]]",
                      depends=[fasta_filtered_file, fasta_discarded_file],
                      targets=fasta_file)

    return fasta_filtered_file, fasta_file
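
# Illustrative usage sketch (not part of the original source): shows the expected
# call signature of filter_fastq. File paths, maxee, trunc_len, and fastq_ascii
# values are placeholder assumptions (fastq_ascii is typically 33 or 64).
def _example_filter_fastq():
    from anadama2 import Workflow

    workflow = Workflow()
    filtered_fasta, all_reads_fasta = filter_fastq(
        workflow, "vsearch", "all_samples_concatenated.fastq", "output",
        threads=4, maxee=1, trunc_len=200, fastq_ascii=33)
    workflow.go()
    return filtered_fasta, all_reads_fasta
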
def global_alignment(workflow,
                     method,
                     fasta_file,
                     database_file,
                     id,
                     threads,
                     output_file_uc,
                     output_file_tsv,
                     top_hit_only=None):
    """ Run global alignment with the database provided 
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis - usearch(default) or vsearch
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        database_file (string): Path to the database file (fasta or usearch format)
        id (float): The percent identity for alignment
        threads (int): The number of threads/cores for each task
        output_file_uc (string): The name for the uc output file 
        output_file_tsv (string): The name for the tsv output file 
        top_hit_only (bool): If set, only get the top hits.
        
    Requires:
        usearch or vsearch
        
    Returns:
        list: Path to the mapping results files

    """

    optional_flags = ""

    # run the global alignment, adding the optional top-hit flag if requested
    if method == "vsearch":
        if top_hit_only:
            optional_flags = " -top_hits_only"
        workflow.add_task_gridable(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -usearch_global [depends[0]] -db [depends[1]] -strand plus -id [args[1]] -uc [targets[0]] -otutabout [targets[1]] -threads [args[0]]" + optional_flags,
            depends=[fasta_file, database_file, TrackedExecutable("vsearch")],
            targets=[output_file_uc, output_file_tsv],
            args=[threads, id],
            name="vsearch_global",
            time=60,  # 60 minutes
            mem=2 * 1024,  # 2 GB
            cores=threads)  # time/mem based on 8 cores
    else:
        if top_hit_only:
            optional_flags = " -top_hit_only"
        workflow.add_task_gridable(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -usearch_global [depends[0]] -db [depends[1]] -strand 'both' -id [args[1]] -uc [targets[0]] -otutabout [targets[1]] -threads [args[0]]"+optional_flags,
            depends=[fasta_file, database_file, TrackedExecutable("usearch")],
            targets=[output_file_uc, output_file_tsv],
            args=[threads, id],
            name="usearch_global",
            time=60, # 60 minutes
            mem=2*1024, # 2 GB
            cores=threads) # time/mem based on 8 cores

    return [output_file_uc, output_file_tsv]
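
# Illustrative usage sketch (not part of the original source): runs a closed-reference
# style global alignment against a reference database. The database path, identity
# threshold, and output file names are placeholder assumptions.
def _example_global_alignment():
    from anadama2 import Workflow

    workflow = Workflow()
    uc_file, tsv_file = "all_samples_clustered.uc", "all_samples_otu_table.tsv"
    global_alignment(workflow, "vsearch", "all_samples_otus_nonchimeras.fasta",
                     "greengenes_97.fasta", id=0.97, threads=8,
                     output_file_uc=uc_file, output_file_tsv=tsv_file,
                     top_hit_only=True)
    workflow.go()
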
def picrust(workflow, otu_table_biom, output_folder):
    """ Runs picrust normalize, then predict
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        otu_table_biom (string): The path to the biom file (closed reference otu table).
        output_folder (string): The path of the output folder.
        
    Requires:
        Picrust v1.1: Software to predict metagenome function.
        
    Returns:
        string: The path to the categorized function file in biom format.
        string: The path to the predicted metagenome file in biom format.
    
    """

    # normalize the otu table
    normalized_otu_table = utilities.name_files(
        "all_samples_normalize_by_copy_number.biom", output_folder)
    # first remove target file as picrust will not overwrite
    # expects biom file is json (not hdf5) format
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "+\
        "normalize_by_copy_number.py -i [depends[0]] -o [targets[0]]",
        depends=[otu_table_biom,TrackedExecutable("normalize_by_copy_number.py")],
        targets=normalized_otu_table,
        name="normalize_by_copy_number.py")

    # predict metagenomes
    predict_metagenomes_table = utilities.name_files(
        "all_samples_predict_metagenomes.biom", output_folder)
    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "+\
        "predict_metagenomes.py -i [depends[0]] -o [targets[0]]",
        depends=[normalized_otu_table,TrackedExecutable("predict_metagenomes.py")],
        targets=predict_metagenomes_table,
        name="predict_metagenomes.py")

    # categorize by function
    categorized_function_table = utilities.name_files(
        "all_samples_categorize_by_function.biom", output_folder)
    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "categorize_by_function.py -i [depends[0]] -o [targets[0]] --level 3 -c KEGG_Pathways",
        depends=[predict_metagenomes_table, TrackedExecutable("categorize_by_function.py")],
        targets=categorized_function_table,
        name="categorize_by_function.py")

    return categorized_function_table, predict_metagenomes_table
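
# Illustrative usage sketch (not part of the original source): runs the three PICRUSt v1
# steps on a closed-reference OTU table. The biom file is expected to be in json format
# and its path here is a placeholder assumption.
def _example_picrust():
    from anadama2 import Workflow

    workflow = Workflow()
    functional_biom, predicted_biom = picrust(
        workflow, "all_samples_closed_reference.biom", "output")
    workflow.go()
    return functional_biom, predicted_biom
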
def cluster_otus(workflow, method, fasta_file, reference_fasta, output_folder):
    """ Cluster the otus with usearch
    
    Args:
        workflow (anadama2.workflow): an instance of the workflow class.
        method (string): tools for sequence analysis - usearch (default) or vsearch
        fasta_file (string): the path to the fasta file (filtered and dereplicated).
        reference_fasta (string): the path to reference fasta db
        output_folder (string): the path of the output folder.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the fasta file of non-chimeric OTU sequences

    """

    # get the name of the output files
    output_fasta = utilities.name_files("all_samples_otus_nonchimeras.fasta",
                                        output_folder)

    if method == "vsearch":
        output_txt = utilities.name_files("all_samples_vsearch_otus.txt",
                                          output_folder)
        all_otus = utilities.name_files("all_otus.fasta", output_folder)
        workflow.add_task(
            "vsearch --cluster_size [depends[0]] --consout [targets[0]] --id 0.97 --relabel 'OTU' --uc [targets[1]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=[all_otus, output_txt],
            name="vsearch_cluster_otus")

        workflow.add_task(
            "vsearch --uchime_ref [depends[0]] --nonchimeras [targets[0]] --strand plus --db [args[0]]",
            depends=[all_otus, TrackedExecutable("vsearch")],
            targets=[output_fasta],
            args=[reference_fasta],
            name="vsearch_nochim")
    else:
        output_txt = utilities.name_files("all_samples_uparse_otus.txt",
                                          output_folder)
        workflow.add_task(
            "usearch -cluster_otus [depends[0]] -otus [targets[0]] -relabel 'OTU' -uparseout [targets[1]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=[output_fasta, output_txt],
            name="usearch_cluster_otus")

    return output_fasta
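
# Illustrative usage sketch (not part of the original source): clusters OTUs from the
# sorted, dereplicated reads. The reference fasta used for chimera checking (vsearch
# branch only) and the other paths are placeholder assumptions.
def _example_cluster_otus():
    from anadama2 import Workflow

    workflow = Workflow()
    nonchimera_otus = cluster_otus(workflow, "vsearch",
                                   "all_samples_sorted.fasta",
                                   "greengenes_97.fasta", "output")
    workflow.go()
    return nonchimera_otus
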
def centroid_alignment(workflow,
                       fasta_file,
                       output_fasta,
                       threads,
                       task_name=None):
    """ Run clustalo for centroid alignment
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        fasta_file (string): The path to the fasta file (otu sequences).
        output_fasta (string): The path of the output file.
        threads (int): The number of threads/cores for each task.
        task_name (string): The custom name of the task.
        
    Requires:
        clustal omega: multiple sequence alignment for proteins 
        
    Returns:
        string: Path to the clustered otu file

    """

    # remove existing output file if already exists as clustalo will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "
        "clustalo -i [depends[0]] -o [targets[0]] --threads [args[0]]",
        depends=[
            fasta_file,
            TrackedExecutable(
                "clustalo",
                version_command="echo 'clustalo' `clustalo --version`")
        ],
        targets=output_fasta,
        args=threads,
        name=task_name if task_name else "clustalo")

    return output_fasta
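
# Illustrative usage sketch (not part of the original source): aligns OTU centroid
# sequences with clustal omega. Input/output paths are placeholder assumptions.
def _example_centroid_alignment():
    from anadama2 import Workflow

    workflow = Workflow()
    centroid_alignment(workflow, "all_samples_otus_nonchimeras.fasta",
                       "all_samples_otus_aligned.fasta", threads=4,
                       task_name="clustalo_otus")
    workflow.go()
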
def convert_from_biom_to_tsv(workflow,
                             biom_file,
                             tsv_file,
                             table_type="OTU table",
                             options=""):
    """ Convert from a biom file to a tsv file 
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        biom_file (string): The path to the biom file.
        tsv_file (string): The path to write the new tsv file.
        table_type (string): The type of table to convert
        options (string): Additional options to provide to the convert function
        
    Requires:
        Biom v2: A tool for general use formatting of biological data.
        
    """

    # first remove biom file if exists as biom will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "+\
        "biom convert -i [depends[0]] -o [targets[0]] --table-type='"+table_type+"' --to-tsv "+options,
        depends=[biom_file,TrackedExecutable("biom")],
        targets=tsv_file,
        name="biom")
def optional_prioritization(workflow, prioritization_conf, interested_function,
                            protein_family_ann, supervised_priority,
                            output_folder, selected_priority):
    """
	This set of tasks will run prioritization using functional annotations as optional filters.

	Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		prioritization_conf: Configuration file for quantitative prioritization.
		protein_family_ann: Finalized annotation file for protein families.
		supervised_priority: Supervised prioritization file.

	Requires:
		config file
		annotation and quantitative prioritization files

	Returns:
		string: the name of annotation-based prioritization file.

	Example:
		from anadama2 import Workflow
		from MetaWIBELE.characterize import characterization

		# create an anadama2 workflow instance
		workflow=Workflow()

		# annotation_based_prioritization tasks
		myselection = prioritization.optional_prioritization (workflow, args.prioritization_conf,
		                                                             protein_family_ann,
		                                                             supervised_priority,
		                                                             output_dir, selected_priority)
		# run the workflow
		workflow.go()
	"""

    config.logger.info("###### Start optional_prioritization module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # get the clustering output files
    priority_dir = output_folder
    if not os.path.exists(priority_dir):
        os.system("mkdir -p " + priority_dir)

    # run annotation-based prioritization
    mylog = re.sub(".tsv", ".log", selected_priority)
    workflow.add_task(
        "metawibele_filter_prioritization -c [depends[0]] -a [depends[1]] -p [depends[2]] -f [args[0]] -o [targets[0]] > [args[1]] 2>&1",
        depends=[
            prioritization_conf, protein_family_ann, supervised_priority,
            TrackedExecutable("metawibele_filter_prioritization")
        ],
        targets=[selected_priority],
        args=[interested_function, mylog],
        cores=1,
        name="filter_prioritization")

    return selected_priority
def quality_report(workflow,
                   method,
                   fastq_file,
                   output_folder,
                   threads,
                   qmax=45):
    """ Generate a qc report from the fastq file of all samples
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis - usearch(default) or vsearch
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        qmax (int): Max qvalue increased from the default of 43 to allow for Ion Torrent data
    Requires:
        usearch or vsearch
        
    Returns:
        string: A path to the qc report file
        
    """

    # get the name of the final merged fastq file
    qc_file = files.SixteenS.path("eestats2", output_folder)
    if method == 'vsearch':
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -fastq_eestats2 [depends[0]] -output [targets[0]] -threads [args[0]]",
            depends=[fastq_file, TrackedExecutable("vsearch")],
            targets=qc_file,
            args=threads,
            name="vsearch_fastq_eestats2")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -fastq_eestats2 [depends[0]] -output [targets[0]] -threads [args[0]] -fastq_qmax [args[1]]",
            depends=[fastq_file,TrackedExecutable("usearch")],
            targets=qc_file,
            args=[threads,qmax],
            name="usearch_fastq_eestats2")

    return qc_file
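
# Illustrative usage sketch (not part of the original source): generates the eestats2
# quality report for the merged fastq of all samples. The fastq path and thread count
# are placeholder assumptions.
def _example_quality_report():
    from anadama2 import Workflow

    workflow = Workflow()
    qc_report = quality_report(workflow, "vsearch",
                               "all_samples_concatenated.fastq",
                               "output", threads=4)
    workflow.go()
    return qc_report
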
def dereplicate(workflow, method, fasta_file, output_folder, threads):
    """ Dereplicate reads
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis - usearch (default) or vsearch
        fasta_file (string): The path to the fasta file (filtered).
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the dereplicated fasta file
    """

    # get the name of the output files
    output_file = utilities.name_files("all_samples_dereplicated.fasta",
                                       output_folder)

    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch --derep_fulllength [depends[0]] --output [targets[0]] --sizein --sizeout --threads [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=threads,
            name="vsearch_derep_fulllength")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -derep_fulllength [depends[0]] -fastaout [targets[0]] -sizeout -threads [args[0]]",
            depends=[fasta_file,TrackedExecutable("usearch")],
            targets=output_file,
            args=threads,
            name="usearch_derep_fulllength")

    return output_file
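
# Illustrative usage sketch (not part of the original source): dereplicates the
# filtered fasta before sorting and clustering. Paths and thread count are
# placeholder assumptions.
def _example_dereplicate():
    from anadama2 import Workflow

    workflow = Workflow()
    dereplicated_fasta = dereplicate(workflow, "vsearch",
                                     "all_samples_concatenated_filtered.fasta",
                                     "output", threads=4)
    workflow.go()
    return dereplicated_fasta
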
def const_seq_table(workflow, output_folder, filtered_dir, mergers_file_path, threads):
    """ Builds ASV table, removes chimeras, creates read counts at each step, and fasta file with all sequences

    Args:
        workflow (anadama2.workflow): an instance of the workflow class
        output_folder (string): path to output folder
        filtered_dir (string): path to directory with filtered files
        mergers_file_path (string): path to rds file that contains merged reads
        threads (int): number of threads

    Requires:
        dada2, tools, seqinr r packages

    Returns:
        string: path to rds file that contains ASV data
        string: path to read counts at each step tsv file
        string: path to fasta file with all sequences
    """

    read_counts_steps_path = files.SixteenS.path("counts_each_step", output_folder)

    seqtab_file_path = os.path.join(output_folder, "seqtab_final.rds")
    seqs_fasta_path = os.path.join(output_folder, "sequences.fasta")
    readcounts_rds = "Read_counts_filt.rds"
    asv_tsv = "all_samples_SV_counts.tsv"

    script_path = utilities.get_package_file("const_seq_table", "Rscript")
    version_script = utilities.get_package_file("dada2_version", "Rscript")

    version_command = """echo 'r' `r -e 'packageVersion("dada2")' | grep -C 1 dada2`"""

    workflow.add_task(
        "[vars[0]] \
          --output_dir=[args[0]]\
          --filtered_dir=[args[1]]\
          --merged_file_path=[depends[0]]\
          --read_counts_steps_path=[targets[0]]\
          --readcounts_rds=[vars[2]]\
          --asv_tsv=[vars[3]]\
          --seqtab_file_path=[targets[1]]\
          --seqs_fasta_path=[targets[2]]\
          --threads=[vars[1]]",
        depends=[mergers_file_path, TrackedExecutable("R", version_command="echo '" + version_script + "' `" + version_script + "`")],
        targets=[read_counts_steps_path, seqtab_file_path, seqs_fasta_path],
        args=[output_folder, filtered_dir],
        vars=[script_path, threads, readcounts_rds, asv_tsv],
        name="construct_sequence_table")
    return seqtab_file_path, read_counts_steps_path, seqs_fasta_path
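
# Illustrative usage sketch (not part of the original source): builds the DADA2 ASV
# table from previously merged reads. The filtered-read directory and merged rds
# path are placeholder assumptions.
def _example_const_seq_table():
    from anadama2 import Workflow

    workflow = Workflow()
    seqtab_rds, read_counts_tsv, all_seqs_fasta = const_seq_table(
        workflow, "output", "output/filtered", "output/mergers.rds", threads=4)
    workflow.go()
    return seqtab_rds, read_counts_tsv, all_seqs_fasta
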
def assembly(workflow, input_dir, extension, extension_paired, threads,
             output_folder, contigs):
    """
	This set of tasks will run assembly on the input files provided.

	Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		input_dir: The directory path of fastq files.
		extension: The extension for all reads files, e.g. .fastq.gz
		extension_paired: The extension for paired reads, e.g. _R1.fastq.gz,_R2.fastq.gz
		threads (int): The number of threads/cores for clustering to use.
		output_folder (string): The path of the output folder.
		contigs: The summarized contig file.

	Requires:
		megahit v1.1.3: A program for assembling metagenomic sequencing reads
		fastq files

	Returns:
		string: the name of contigs file.

	Example:
		from anadama2 import Workflow
		from MetaWIBELE.characterize import characterization

		# create an anadama2 workflow instance
		workflow=Workflow()

		# add assembly tasks
		mycontigs  = preprocessing_tasks.assembly (workflow, input_dir, args.sample_file,
												   args.extension_paired, args.extension_orphan,
												   args.threads,
												   assembly_dir, contigs)
		# run the workflow
		workflow.go()
	"""

    config.logger.info("###### Start assembly module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    pair_identifier = None
    pair_identifier2 = None
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        pair_identifier = re.sub(extension, "", extension_paireds[0])
        pair_identifier2 = re.sub("1", "2", pair_identifier)
        sample_files = utilities.find_files(input_dir, extension_paireds[0],
                                            None)
        samples = utilities.sample_names(sample_files, extension_paireds[0],
                                         None)
    else:
        extension_paireds = [extension]
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    split_dir = input_dir
    assembly_dir = output_folder

    split_files = []
    contigs_list = []
    for sample in samples:
        mypair = "none"
        myorphan = "none"
        mypair_tmp = []
        for item in extension_paireds:
            if item == "none":
                continue
            myfile = os.path.join(split_dir, sample + item)
            if os.path.isfile(myfile):
                mypair_tmp.append(myfile)
            else:
                sys.exit("File not exist! " + myfile)
        if len(mypair_tmp) == 1:
            # split into paired reads files
            mypair_tmp = utilities.split_paired_reads(mypair_tmp[0], extension,
                                                      pair_identifier)
            if len(mypair_tmp) == 1:
                myorphan = mypair_tmp[0]
            if len(mypair_tmp) == 2:
                mypair = ",".join(mypair_tmp)
            if len(mypair_tmp) == 3:
                mypair = ",".join(mypair_tmp[0:2])
                myorphan = mypair_tmp[2]
        else:
            if len(mypair_tmp) == 2:
                mypair = ",".join(mypair_tmp)
            if len(mypair_tmp) == 3:
                tmp1 = []
                tmp2 = []
                for i in mypair_tmp:
                    if re.search(pair_identifier, i):
                        tmp1.append(i)
                    elif re.search(pair_identifier2, i):
                        tmp1.append(i)
                    else:
                        tmp2.append(i)
                if len(tmp1) > 0:
                    mypair = ",".join(tmp1)
                if len(tmp2) > 0:
                    myorphan = ",".join(tmp2)
        split_files.append((sample, mypair, myorphan))

        seq_base = sample
        megahit_contig_dir = os.path.join(assembly_dir, seq_base)
        megahit_contig = os.path.join(megahit_contig_dir,
                                      '%s.contigs.fa' % seq_base)
        contigs_list.append(megahit_contig)

    ## run MEGAHIT
    os.system("mkdir -p " + assembly_dir)
    for (sample, mypair, myorphan) in split_files:
        seq_base = sample
        megahit_contig_dir = os.path.join(assembly_dir, seq_base)
        megahit_contig = os.path.join(megahit_contig_dir,
                                      '%s.contigs.fa' % seq_base)

        ## MEGAHIT needs memory in a byte format so let's take care of data
        #time_equation = "24*60 if file_size('[depends[0]]') < 25 else 6*24*60" # 24 hours or more depending on file size
        #mem_equation = "32*1024 if file_size('[depends[0]]') < 25 else 3*32*1024" # 32 GB or more depending on file size
        mylog = os.path.join(assembly_dir, '%s.log' % seq_base)

        if mypair != "none":
            tmp = mypair.split(",")
            if len(tmp) == 2:  # paired reads:
                tmp = mypair.split(",")
                f_seq = tmp[0]
                r_seq = tmp[1]
                if myorphan != "none":
                    workflow.add_task_gridable(
                        "rm -rf " + megahit_contig_dir + " && " +
                        "megahit -1 [depends[0]] -2 [depends[1]] -r [args[2]] -t [args[0]] -o [args[3]] --out-prefix [args[1]] >[args[4]] 2>&1",
                        depends=[f_seq, r_seq,
                                 TrackedExecutable("megahit")],
                        targets=[megahit_contig],
                        args=[
                            threads, seq_base, myorphan, megahit_contig_dir,
                            mylog
                        ],
                        cores=threads,
                        mem=mem_equation,
                        time=time_equation,
                        name=sample + "__megahit")
                else:
                    workflow.add_task_gridable(
                        "rm -rf " + megahit_contig_dir + " && " +
                        "megahit -1 [depends[0]] -2 [depends[1]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                        depends=[f_seq, r_seq,
                                 TrackedExecutable("megahit")],
                        targets=[megahit_contig],
                        args=[threads, seq_base, megahit_contig_dir, mylog],
                        cores=threads,
                        mem=mem_equation,
                        time=time_equation,
                        name=sample + "__megahit")
            else:
                workflow.add_task_gridable(
                    "rm -rf " + megahit_contig_dir + " && " +
                    "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                    depends=[mypair, TrackedExecutable("megahit")],
                    targets=[megahit_contig],
                    args=[threads, seq_base, megahit_contig_dir, mylog],
                    cores=threads,
                    mem=mem_equation,
                    time=time_equation,
                    name=sample + "__megahit")
        else:
            if myorphan != "none":
                workflow.add_task_gridable(
                    "rm -rf " + megahit_contig_dir + " && " +
                    "megahit -r [depends[0]] -t [args[0]] -o [args[2]] --out-prefix [args[1]] >[args[3]] 2>&1",
                    depends=[myorphan, TrackedExecutable("megahit")],
                    targets=[megahit_contig],
                    args=[threads, seq_base, megahit_contig_dir, mylog],
                    cores=threads,
                    mem=mem_equation,
                    time=time_equation,
                    name=sample + "__megahit")

    for myfile in contigs_list:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(assembly_dir, myname)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    ## combine contigs sequences
    mylog = contigs + ".log"
    workflow.add_task(
        "metawibele_format_contig_sequences -p [args[0]] -e contigs.fa -o [targets[0]] > [args[1]] 2>&1",
        depends=utilities.add_to_list(
            contigs_list,
            TrackedExecutable("metawibele_format_contig_sequences")),
        targets=[contigs],
        args=[assembly_dir, mylog],
        cores=1,
        name="format_contig_table")

    return contigs_list
def gene_catalog(workflow, complete_gene, complete_protein, input_dir,
                 extension, extension_paired, threads, prefix_gene_catalog,
                 gene_catalog, gene_catalog_nuc, gene_catalog_prot,
                 mapping_dir, gene_catalog_saf, gene_catalog_count):
    """
    This set of tasks will build gene catalogs.

    Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
        complete_protein: The fasta file of protein sequences for complete ORFs.
		mapping_dir: The directory path of mapping results.
        prefix_gene_catalog: The prefix of gene catalog file.
        gene_catalog: The gene catalog file.
        gene_catalog_nuc: The fasta file of nucleotide sequences for gene catalogs.
        gene_catalog_prot: The fasta file of protein sequences for gene catalogs.
        gene_catalog_saf: The SAF gtf file for gene catalogs.
        gene_catalog_count: The count file for gene catalogs.


    Requires:
        bowtie2 (tested with 2.3.2)
        samtools (tested with 1.5)
        featureCounts (tested with Version 1.6.2)
        the nucleotide and amino acid sequences for gene catalogs
        fastq files for each sample

    Returns:
        string: file names of gene catalogs

    Example:
        from anadama2 import Workflow
        from MetaWIBELE.characterize import characterization

        # create an anadama2 workflow instance
        workflow=Workflow()

        # add quality control tasks for the fastq files
		mygene_catalog, mycounts = preprocessing_tasks.gene_catalogs (workflow, complete_gene, complete_protein,
		                                                              mapping_dir,
		                                                              prefix_gene_catalog, gene_catalog, gene_catalog_nuc, gene_catalog_prot,
		                                                              gene_catalog_saf, gene_catalog_count)

        # run the workflow
        workflow.go()
    """

    config.logger.info("###### Start gene_catalog module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    ### run gene-catalog workflow
    mylog = gene_catalog_nuc + ".log"
    myclust = gene_catalog_nuc + ".clstr"
    workflow.add_task(
        'cd-hit-est -i [depends[0]] [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[complete_gene, TrackedExecutable("cd-hit-est")],
        targets=[gene_catalog_nuc, myclust],
        args=[config.cd_hit_gene_opts, mylog],
        cores=threads,
        name="cd-hit-est")

    mylog = gene_catalog + ".log"
    workflow.add_task(
        'metawibele_extract_cluster -c [depends[0]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[myclust,
                 TrackedExecutable("metawibele_extract_cluster")],
        targets=[gene_catalog],
        args=[mylog],
        cores=1,
        name="extract_cluster_CD-hit")

    mylog = gene_catalog_prot + ".log"
    workflow.add_task(
        'metawibele_extract_non_redundance_seq -r [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1 ',
        depends=[
            gene_catalog_nuc, complete_protein,
            TrackedExecutable("metawibele_extract_non_redundance_seq")
        ],
        targets=[gene_catalog_prot],
        args=[mylog],
        cores=1,
        name="extract_non_redundance_seq")

    ### get the abundance of gene catalog
    # run gene-abundance workflow
    mylog = gene_catalog_saf + ".log"
    workflow.add_task(
        'metawibele_gene_abundance_indexRef -r [depends[0]] -t gene -b [args[0]] -o [targets[0]] >[args[1]] 2>&1 ',
        depends=[
            gene_catalog_nuc,
            TrackedExecutable("metawibele_gene_abundance_indexRef")
        ],
        targets=[gene_catalog_saf],
        args=[prefix_gene_catalog, mylog],
        cores=1,
        name="gene_abundance_indexRef")

    ## collect sequences
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, extension_paireds[0],
                                            None)
        samples = utilities.sample_names(sample_files, extension_paireds[0],
                                         None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)

    ## bowtie2 will map reads to gene categories
    flt_seqs = []
    for sample in samples:
        seq_file = "NA"
        if extension_paired:
            tmp = extension_paired.split(",")
        else:
            if extension != "none":
                tmp = extension.split(",")
        for item in tmp:
            if seq_file == "NA":
                seq_file = os.path.join(input_dir, sample + '%s' % item)
            else:
                seq_file = seq_file + "," + os.path.join(
                    input_dir, sample + '%s' % item)
        flt_seqs.append((sample, seq_file))
    # foreach sample

    ## Now run bowtie2 to map reads to gene categories
    mappings = []
    mappings_tmp = []
    #mem_equation = "2*12*1024 if file_size('[depends[0]]') < 10 else 4*12*1024"
    #time_equation = "2*60 if file_size('[depends[0]]') < 10 else 2*2*60"
    for (sample, seq_file) in flt_seqs:
        seq_base = sample
        mydir = os.path.join(mapping_dir, sample)
        os.system("mkdir -p " + mydir)
        sample_counts = os.path.join(mydir, seq_base + ".sort.bed")
        stdout_log = os.path.join(mydir, '%s.mapping.stdout.log' % seq_base)
        mappings_tmp.append(sample_counts)

        workflow.add_task(
            'metawibele_gene_abundance -r [depends[0]] -u [args[0]] -t [args[1]] -s [args[2]] -w [args[3]] '
            '> [args[4]] 2>&1 ',
            depends=[
                gene_catalog_nuc, gene_catalog_saf,
                TrackedExecutable("metawibele_gene_abundance")
            ],
            targets=[sample_counts],
            args=[seq_file, threads, seq_base, mydir, stdout_log],
            cores=1,
            name=sample + "__gene_abundance")

    for myfile in mappings_tmp:
        myname = os.path.basename(myfile)
        myfile_new = os.path.join(mapping_dir, myname)
        mappings.append(myfile_new)
        workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                          depends=[myfile],
                          targets=[myfile_new],
                          cores=1,
                          name="ln__" + myname)

    # collect abundance
    mylog = gene_catalog_count + ".log"
    workflow.add_task(
        'metawibele_gene_catalog_abundance -p [args[0]] -s sort.bed -c [args[1]] -o [targets[0]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            mappings, TrackedExecutable("metawibele_gene_catalog_abundance")),
        targets=[gene_catalog_count],
        args=[mapping_dir, gene_catalog, mylog],
        cores=1,
        name="gene_catalog_abundance")

    return gene_catalog, gene_catalog_count
def demultiplex(workflow, input_files, extension, output_folder, barcode_file, index_files, min_phred, pair_identifier):
    """Demultiplex the files (single end or paired)
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files for input to ea-utils.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        barcode_file (string): A file of barcodes.
        index_files (string): A list of paths to the index files.
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        
    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.
        
    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files
        
    """
    
    # error if there is more than one index file
    if len(index_files) > 1:
        sys.exit("ERROR: Only one index file expected for demultiplexing step.")
    
    # read the barcode file to get the expected output files 
    try:
        file_handle=open(barcode_file)
        lines=file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read barcode file: " + barcode_file)
        
    samples=set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name=line.rstrip().split("\t")[0]
            if sample_name:
                samples.add(sample_name)
            
    # get the names of the expected output files
    demultiplex_fastq_files = utilities.name_files(samples,output_folder,subfolder="demultiplex",extension="fastq")
    
    # name the barcode file with the reverse complement barcodes added
    expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt",output_folder,subfolder="demultiplex",create_folder=True)
    
    # create a file that includes the reverse complements of the barcodes
    workflow.add_task(
        "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]",
        depends=barcode_file,
        targets=expanded_barcode_file)
    
    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)
    
    # capture the demultiplex stats in output files, one for each set of input files
    if input_pair1:
        demultiplex_log = utilities.name_files(input_pair1[0],output_folder,subfolder="demultiplex",extension="log")
    else:
        demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log")
        
    # get the output folder for all files
    demultiplex_output_folder = os.path.dirname(demultiplex_log)
    
    # get the basenames of the output files, one for each sample
    demultiplex_output_basenames = utilities.name_files(samples,output_folder,subfolder="demultiplex")
    
    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")
    
    if input_pair1 and input_pair2:
        # this run has paired input files
        # get the second pair identifier
        pair_identifier2=pair_identifier.replace("1","2",1)
        # get the names of the expected output files
        demultiplex_fastq_files_R1 = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files_R2 = [file+pair_identifier2+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files = demultiplex_fastq_files_R1+demultiplex_fastq_files_R2
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
        
    else:
        # this run has single end input files
        # get the names of the expected output files
        demultiplex_fastq_files = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")

    demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log, demultiplex_fastq_files)


    return demultiplex_fastq_files, demultiplex_output_folder
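
# Illustrative usage sketch (not part of the original source): demultiplexes a single
# pair of raw fastq files with one index file. The input file names, barcode file,
# minimum phred score, and pair identifier are placeholder assumptions.
def _example_demultiplex():
    from anadama2 import Workflow

    workflow = Workflow()
    fastq_files, demux_folder = demultiplex(
        workflow,
        input_files=["run1_R1_001.fastq", "run1_R2_001.fastq"],
        extension=".fastq",
        output_folder="output",
        barcode_file="barcodes.txt",
        index_files=["run1_I1_001.fastq"],
        min_phred=2,
        pair_identifier="_R1")
    workflow.go()
    return fastq_files, demux_folder
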
def gene_calling(workflow, assembly_dir, assembly_extentsion, input_dir,
                 extension, extension_paired, gene_call_type, prokka_dir,
                 prodigal_dir, threads, gene_file, gene_PC_file, protein_file,
                 protein_sort, gene_info, complete_gene, complete_protein):
    """
	This set of tasks will run gene-calling workflow.

	Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		assembly_dir: The directory path of assembly results.
		sample_file: The sample list file.
		prokka_dir: The directory path of prokka results.
		prodigal_dir: The directory path of prodigal results.
		gene_file: The fasta file of gene nucleotide sequences.
		gene_PC_file: The fasta file of protein coding gene nucleotide sequences.
		protein_file: The fasta file of protein sequences.
		protein_sort: The sorted fasta file of protein sequences.
		gene_info: The summarized gene calling file.
		complete_gene: The fasta file of gene nucleotide sequences for complete ORFs.
		complete_protein: The fasta file of protein sequences for complete ORFs.

	Requires:
		prokka 1.14-dev: rapid prokaryotic genome annotation (recommended to disable the '-c' parameter in prodigal)
		prodigal v2.6: gene prediction
		usearch (tested with usearch v9.0.2132_i86linux64)
		assembled contig files

	Returns:
		string: name of gene files

	Example:
		from anadama2 import Workflow
		from MetaWIBELE.characterize import characterization

		# create an anadama2 workflow instance
		workflow=Workflow()

		# add gene calling tasks
		mygene, myprotein = preprocessing_tasks.gene_calling (workflow, assembly_dir, args.sample_file,
															  prokka_dir, prodigal_dir,
															  gene_file, gene_PC_file, protein_file, protein_sort,
															  gene_info, complete_gene, complete_protein)
		# run the workflow
		workflow.go()
	"""

    config.logger.info("###### Start gene_calling module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    # ================================================
    # collect sequences
    # ================================================
    if extension_paired:
        extension_paireds = extension_paired.split(",")
        sample_files = utilities.find_files(input_dir, extension_paireds[0],
                                            None)
        samples = utilities.sample_names(sample_files, extension_paireds[0],
                                         None)
    else:
        sample_files = utilities.find_files(input_dir, extension, None)
        samples = utilities.sample_names(sample_files, extension, None)
    sequence_files = []
    for mysample in samples:
        myfile = os.path.join(assembly_dir, mysample,
                              mysample + "%s" % assembly_extentsion)
        sequence_files.append(myfile)
    # foreach sample

    filtered_contigs = sequence_files

    # ================================================
    # Gene calling
    # ================================================
    fna_file = []
    faa_file = []
    gff_files = []
    fna_file_tmp = []
    faa_file_tmp = []
    gff_files_tmp = []

    ## Using Prodigal
    if gene_call_type == "prodigal" or gene_call_type == "both":
        os.system("mkdir -p " + prodigal_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            annotation_dir = os.path.join(prodigal_dir, contig_base)
            os.system("mkdir -p " + annotation_dir)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            cds_file = os.path.join(annotation_dir, '%s.fna' % contig_base)
            cds_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            score = os.path.join(annotation_dir,
                                 '%s.gene_score.txt' % contig_base)
            stdout_log = os.path.join(annotation_dir,
                                      '%s.stdout.log' % contig_base)
            faa_file_tmp.append(cds_aa)

            workflow.add_task_gridable(
                'prodigal -m -p meta -i [depends[0]] '
                '-f gff -o [targets[0]] -d [targets[1]] -s [targets[3]] '
                '-a [targets[2]] '
                '>[args[0]] 2>&1',
                depends=[contig, TrackedExecutable("prodigal")],
                targets=[gff_file, cds_file, cds_aa, score],
                args=[stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prodigal")

        for myfile in faa_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prodigal_dir, myname)
            faa_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)
            myfna = re.sub(".faa", ".fna", myfile)
            myfna_new = re.sub(".faa", ".fna", myfile_new)
            if gene_call_type == "prodigal":
                fna_file.append(myfna_new)
                mygff_new = re.sub(".faa", ".gff", myfile_new)
                gff_files.append(mygff_new)
                prokka_dir = prodigal_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfna],
                              targets=[myfna_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfna))
            mygff = re.sub(".faa", ".gff", myfile)
            mygff_new = re.sub(".faa", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    if gene_call_type == "prokka" or gene_call_type == "both":
        ## Calling genes with Prokka
        os.system("mkdir -p " + prokka_dir)
        for contig in filtered_contigs:
            contig_base = os.path.basename(contig).split(os.extsep)[0]
            sample = os.path.basename(contig_base)
            annotation_dir = os.path.join(prokka_dir, sample)
            os.system("mkdir -p " + annotation_dir)
            stdout_log = os.path.join(
                annotation_dir, '%s.prokka.bacteria.stdout.log' % contig_base)
            score = os.path.join(annotation_dir,
                                 '%s.gene_score.txt' % contig_base)
            gene_nuc = os.path.join(annotation_dir, '%s.ffn' % contig_base)
            gene_aa = os.path.join(annotation_dir, '%s.faa' % contig_base)
            gff_file = os.path.join(annotation_dir, '%s.gff' % contig_base)
            fna_file_tmp.append(gene_nuc)
            gff_files_tmp.append(gff_file)

            workflow.add_task_gridable(
                'prokka --prefix [args[0]] --addgenes --addmrna --force --metagenome '
                '--cpus [args[2]] '
                '--outdir [args[1]] [depends[0]] '
                '>[args[3]] 2>&1 ',
                depends=[contig, TrackedExecutable("prokka")],
                targets=[gene_nuc, gene_aa, gff_file],
                args=[sample, annotation_dir, threads, stdout_log],
                cores=threads,
                mem=mem_equation,
                time=time_equation,
                name=contig_base + "__prokka")

        for myfile in gff_files_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            gff_files.append(myfile_new)
        for myfile in fna_file_tmp:
            myname = os.path.basename(myfile)
            myfile_new = os.path.join(prokka_dir, myname)
            fna_file.append(myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfile],
                              targets=[myfile_new],
                              cores=1,
                              name="ln__" + myname)
            myfaa = re.sub(".ffn", ".faa", myfile)
            myfaa_new = re.sub(".ffn", ".faa", myfile_new)
            if gene_call_type == "prokka":
                faa_file.append(myfaa_new)
                prodigal_dir = prokka_dir
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[myfaa],
                              targets=[myfaa_new],
                              cores=1,
                              name="ln__" + os.path.basename(myfaa))
            mygff = re.sub(".ffn", ".gff", myfile)
            mygff_new = re.sub(".ffn", ".gff", myfile_new)
            workflow.add_task("ln -fs [depends[0]] [targets[0]]",
                              depends=[mygff],
                              targets=[mygff_new],
                              cores=1,
                              name="ln__" + os.path.basename(mygff))

    # ================================================
    # Summarize sequences
    # ================================================
    #mem_equation = "50000"
    ### combine gene sequences ###
    nuc_type = "ffn"
    if gene_call_type == "prodigal":
        nuc_type = "fna"
    mylog = re.sub(".fna", ".log", gene_file)
    workflow.add_task(
        'metawibele_combine_gene_sequences -p [args[0]] -e [args[1]] -o [targets[0]] > [args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            fna_file, TrackedExecutable("metawibele_combine_gene_sequences")) +
        fna_file_tmp + gff_files + gff_files_tmp,
        targets=[gene_file],
        args=[prokka_dir, nuc_type, mylog],
        cores=1,
        name="combine_gene_sequences")

    ### combine protein sequences ###
    ## collect sequences
    mylog = re.sub(".faa", ".log", protein_file)
    workflow.add_task(
        'metawibele_format_protein_sequences -p [args[0]] -q [args[1]] -e faa -o [targets[0]] '
        '-m [targets[1]] >[args[2]] 2>&1 ',
        depends=utilities.add_to_list(
            faa_file, TrackedExecutable("metawibele_format_protein_sequences"))
        + faa_file_tmp + gff_files + gff_files_tmp,
        targets=[protein_file, gene_info],
        args=[prokka_dir, prodigal_dir, mylog],
        cores=1,
        name="format_protein_sequences")

    ## sort by length and filter out short-length sequence
    mylog = re.sub(".faa", ".log", protein_sort)
    workflow.add_task(
        'usearch -sortbylength [depends[0]] '
        '-fastaout [targets[0]] -minseqlength 0 >[args[0]] 2>&1 ',
        depends=[protein_file, TrackedExecutable("usearch")],
        targets=[protein_sort],
        args=[mylog],
        cores=1,
        name="usearch__sorting")

    ## extract nucleotide sequence for protein coding genes
    mylog = re.sub(".fna", ".log", gene_PC_file)
    workflow.add_task(
        'metawibele_extract_protein_coding_genes -g [depends[0]] -p [depends[1]] -o [targets[0]] > [args[0]] 2>&1 ',
        depends=[
            gene_file, protein_sort,
            TrackedExecutable("metawibele_extract_protein_coding_genes")
        ],
        targets=[gene_PC_file],
        args=[mylog],
        cores=1,
        name="extract_protein_coding_genes")

    ## extract sequences
    mylog = re.sub(".fna", ".log", complete_gene)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[
            gene_info, gene_PC_file,
            TrackedExecutable("metawibele_extract_complete_ORF_seq")
        ],
        targets=[complete_gene],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    mylog = re.sub(".faa", ".log", complete_protein)
    workflow.add_task(
        'metawibele_extract_complete_ORF_seq -t complete -m [depends[0]] -i [depends[1]] -o [targets[0]] >[args[0]] 2>&1',
        depends=[
            gene_info, protein_sort,
            TrackedExecutable("metawibele_extract_complete_ORF_seq")
        ],
        targets=[complete_protein],
        args=[mylog],
        cores=1,
        name='extract_complete_ORF_seq')

    return complete_gene, complete_protein
def mandatory_prioritization(workflow, prioritization_conf, protein_family_ann,
                             protein_family_attr, output_folder):
    """
	This set of tasks will run prioritization using quantitative criteria.

	Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		prioritization_conf: Configuration file for quantitative prioritization.
		protein_family_ann: Finalized annotation file for protein families.
		protein_family_attr: Finalized attribute file for annotations.

	Requires:
		config file
		annotation files

	Returns:
		string: the name of prioritized file.

	Example:
		from anadama2 import Workflow
		from MetaWIBELE.characterize import characterization

		# create an anadama2 workflow instance
		workflow=Workflow()

		# add quantification_based_prioritization tasks
		myrank, mypriority = prioritization.mandatory_prioritization (workflow, args.prioritization_conf,
		                                                                        protein_family_ann, protein_family_attr,
		                                                                        output_dir)
		# run the workflow
		workflow.go()
	"""

    config.logger.info("###### Start mandatory_prioritization module ######")

    # get the clustering output files
    priority_dir = output_folder
    unsupervised_rank = os.path.join(
        priority_dir,
        config.basename + "_unsupervised_prioritization.rank.tsv")
    supervised_rank = os.path.join(
        priority_dir, config.basename + "_supervised_prioritization.rank.tsv")
    #unsupervised_priority = os.path.join(priority_dir, config.basename + "_unsupervised_prioritization.priority.tsv")
    #supervised_priority = os.path.join(priority_dir, config.basename + "_supervised_prioritization.priority.tsv")
    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    if not os.path.exists(priority_dir):
        os.system("mkdir -p " + priority_dir)

    # run unsupervised prioritization
    mylog = re.sub(".tsv", ".log", unsupervised_rank)
    workflow.add_task(
        "metawibele_quantify_prioritization -c [depends[0]] -m unsupervised -w fixed -a [depends[1]] -b [depends[2]] -o [args[0]] >[args[1]] 2>&1",
        depends=[
            prioritization_conf, protein_family_ann, protein_family_attr,
            TrackedExecutable("metawibele_quantify_prioritization")
        ],
        targets=[unsupervised_rank],
        args=[priority_dir, mylog],
        cores=1,
        name="quantify_prioritization__unsupervised")

    # run supervised prioritization
    if not "".join(config.phenotype) == "none":
        mylog = re.sub(".tsv", ".log", supervised_rank)
        workflow.add_task(
            "metawibele_quantify_prioritization -c [depends[0]] -m supervised -w equal -a [depends[1]] -b [depends[2]] -o [args[0]] >[args[1]] 2>&1",
            depends=[
                prioritization_conf, protein_family_ann, protein_family_attr,
                TrackedExecutable("metawibele_quantify_prioritization")
            ],
            targets=[supervised_rank],
            args=[priority_dir, mylog],
            cores=1,
            name="quantify_prioritization__supervised")

    return unsupervised_rank, supervised_rank
# Example 17
def demultiplex_dual(workflow, output_folder, input_files, extension,
            barcode_files, dual_barcode_path, min_phred, pair_identifier):

    """Demultiplex the files (dual indexed paired)

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            input_files (list): A list of paths to fastq(gz) files for input to ea-utils.
            extension (string): The extension for all files.
            output_folder (string): The path of the output folder.
            barcode_files (list): A list of barcode files.
            dual_barcode_path (string): The path to the dual barcode file.
            min_phred (int): The min phred quality score to use in the demultiplex command.
            pair_identifier (string): The string in the file basename to identify
                the first pair in the set.

        Requires:
            ea-utils fastq-multx: A tool to demultiplex fastq files.

        Returns:
            list: A list of the demultiplexed files
            string: output folder of demultiplexed files

        """

    # capture the demultiplex stats in log file, one for each set of input files
    demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log",create_folder=True)
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",
                                            version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # get barcode files
    barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier)

    # get the second pair identifier
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    try:
        file_handle = open(dual_barcode_path)
        lines = file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path)

    run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "")
    demultiplex_files = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.split("\t")[0]

            if sample_name:
                nm1 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier + "." + extension
                nm2 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier2 + "." + extension
                demultiplex_files.add(nm1)
                demultiplex_files.add(nm2)

    # get the names of the expected output files
    # demultiplex_files = utilities.name_files(samples, demultiplex_output_folder, extension=extension)

    workflow.add_task(
        "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]]\
         -o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]]\
         -q [args[2]] > [targets[0]]",
        depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0]],
        args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name, fastq_multx_tracked],
        targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)],
        name="demultiplex_dual")

    demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files)


    return demultiplex_files, demultiplex_output_folder
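# A minimal usage sketch for demultiplex_dual (not part of the original workflow):
# the workflow setup follows the standard AnADAMA2 pattern, and every file name,
# barcode path, and the pair identifier below is a hypothetical placeholder.
from anadama2 import Workflow

workflow = Workflow(version="0.1", description="Demultiplex a dual-indexed run")
args = workflow.parse_args()

demux_files, demux_folder = demultiplex_dual(
    workflow,
    output_folder=args.output,
    input_files=["run1_R1.fastq.gz", "run1_R2.fastq.gz"],  # hypothetical reads
    extension="fastq.gz",
    barcode_files=["run1_barcode_R1.fastq.gz", "run1_barcode_R2.fastq.gz"],  # hypothetical barcode reads
    dual_barcode_path="dual_barcodes.txt",  # hypothetical barcode map
    min_phred=2,
    pair_identifier="_R1")

# run the workflow
workflow.go()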
# Example 18
def finalize_prioritization(workflow, unsupervised_rank,
                            selected_unsup_priority, supervised_rank,
                            selected_priority, output_folder,
                            final_unsupervised_rank,
                            final_selected_unsup_priority,
                            final_supervised_rank, final_selected_priority):
    """
	This set of tasks will format prioritization files

	Args:
		workflow (anadama2.workflow): An instance of the workflow class.
		unsupervised_rank, selected_unsup_priority, supervised_rank, selected_priority: Raw prioritized files.
		output_folder: The path of the output folder.
		final_unsupervised_rank, final_selected_unsup_priority, final_supervised_rank, final_selected_priority: Finalized prioritized files.

	Requires:
		raw prioritized files


	Example:
		from anadama2 import Workflow
		from MetaWIBELE.characterize import characterization

		# create an anadama2 workflow instance
		workflow=Workflow()

		# add finalize prioritization tasks
		finalize_prioritization (workflow,
		                         unsupervised_rank, selected_unsup_priority,
		                         supervised_rank, selected_priority,
		                         output_folder,
		                         final_unsupervised_rank, final_selected_unsup_priority,
		                         final_supervised_rank, final_selected_priority)
		# run the workflow
		workflow.go()
	"""

    config.logger.info("###### Start finalize_prioritization module ######")

    time_equation = config.time  # xxx hours defined in global config
    mem_equation = config.memory  # xxx GB defined in global config

    priority_dir = output_folder
    if not os.path.exists(priority_dir):
        os.system("mkdir -p " + priority_dir)

    # format prioritization
    mylog = re.sub(".tsv", ".log", final_unsupervised_rank)
    workflow.add_task(
        "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1",
        depends=[
            unsupervised_rank,
            TrackedExecutable("metawibele_finalize_prioritization")
        ],
        targets=[final_unsupervised_rank],
        args=[mylog],
        cores=1,
        name="finalize_prioritization__unsupervised_rank")
    mylog = re.sub(".tsv", ".log", final_selected_unsup_priority)
    workflow.add_task(
        "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1",
        depends=[
            selected_unsup_priority,
            TrackedExecutable("metawibele_finalize_prioritization")
        ],
        targets=[final_selected_unsup_priority],
        args=[mylog],
        cores=1,
        name="finalize_prioritization__selected_unsupervised_priority")

    if not "".join(config.phenotype) == "none":
        mylog = re.sub(".tsv", ".log", final_supervised_rank)
        workflow.add_task(
            "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1",
            depends=[
                supervised_rank,
                TrackedExecutable("metawibele_finalize_prioritization")
            ],
            targets=[final_supervised_rank],
            args=[mylog],
            cores=1,
            name="finalize_prioritization__supervised_rank")

        mylog = re.sub(".tsv", ".log", final_selected_priority)
        workflow.add_task(
            "metawibele_finalize_prioritization -i [depends[0]] -o [targets[0]] > [args[0]] 2>&1",
            depends=[
                selected_priority,
                TrackedExecutable("metawibele_finalize_prioritization")
            ],
            targets=[final_selected_priority],
            args=[mylog],
            cores=1,
            name="finalize_prioritization__selected_supervised_priority")
# Example 19
def merge_pairs_and_rename(workflow, method, input_files, extension,
                           output_folder, pair_identifier, threads,
                           fastq_ascii):
    """ Merge the files if pairs and rename sequence ids to match sample id
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tool for sequence analysis - usearch (default) or vsearch
        input_files (list): A list of paths to fastq files.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        threads (int): The number of threads for each task.
        fastq_ascii: The ASCII offset of fastq quality scores (e.g. 33), passed to usearch.
        
    Requires:
        usearch or vsearch
        
    Returns:
        list: A list of the renamed files.
        
    """

    pair1, pair2 = utilities.paired_files(input_files, extension,
                                          pair_identifier)

    if pair1 and pair2:
        # paired input files were found

        # if the files are gzipped, first decompress as fastq_mergepairs will take in fastq.gz but the output will not be correctly formatted
        if pair1[0].endswith(".gz"):
            # get the names of the decompressed output files
            decompressed_pair1 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair1],
                output_folder,
                subfolder="merged_renamed")
            # get the names of the decompressed output files
            decompressed_pair2 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair2],
                output_folder,
                subfolder="merged_renamed")

            # add tasks to decompress the files
            workflow.add_task_group("gunzip -c [depends[0]] > [targets[0]]",
                                    depends=pair1 + pair2,
                                    targets=decompressed_pair1 +
                                    decompressed_pair2)

            # the pair files to be used for the remaining tasks are those that are decompressed
            pair1 = decompressed_pair1
            pair2 = decompressed_pair2

        # get the sample names from the input file names
        sample_names = [
            os.path.basename(file).replace(pair_identifier + ".fastq", "")
            for file in pair1
        ]

        # get the names of the output files
        stitched_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="stitched",
                                              extension="fastq",
                                              create_folder=True)
        unjoined_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="unjoined",
                                              extension="fastq")

        # run usearch to merge pairs, if input files are non-empty
        for read1, read2, stitched_output, unjoined_output in zip(
                pair1, pair2, stitched_files, unjoined_files):
            if method == 'vsearch':
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="vsearch",
                                               threads=threads),
                    depends=[read1, read2,
                             TrackedExecutable("vsearch")],
                    targets=[stitched_output, unjoined_output],
                    name="vsearch_fastq_mergepairs")
            else:
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="userach",
                                               threads=threads,
                                               fastq_ascii=fastq_ascii),
                    depends=[read1, read2,
                             TrackedExecutable("usearch")],
                    targets=[stitched_output, unjoined_output],
                    name="usearch_fastq_mergepairs")

        # merge the stitched and unjoined from the prior step
        renamed_files = utilities.name_files(sample_names,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq")
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]",
            depends=zip(stitched_files, unjoined_files),
            targets=renamed_files)

    else:
        # these files are not pairs and do not need to be merged
        # rename the files
        renamed_files = utilities.name_files(input_files,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq",
                                             create_folder=True)
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]",
            depends=input_files,
            targets=renamed_files)

    return renamed_files
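# A hedged usage sketch for merge_pairs_and_rename (not from the original source):
# input file names and parameter values are hypothetical placeholders.
from anadama2 import Workflow

workflow = Workflow(version="0.1", description="Merge read pairs and rename sequence ids")
args = workflow.parse_args()

renamed_fastqs = merge_pairs_and_rename(
    workflow,
    method="vsearch",
    input_files=["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],  # hypothetical paired reads
    extension="fastq.gz",
    output_folder=args.output,
    pair_identifier="_R1",
    threads=4,
    fastq_ascii=33)

# run the workflow
workflow.go()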
# Example 20
# assumed context (not shown in this snippet): paired-end inputs detected earlier,
# e.g. input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)
if input_pair1 and input_pair2:
    # expected kneaddata output file sets for each paired sample
    qc_targets = [
        utilities.name_files([
            name + ".trimmed.1.fastq", name + ".trimmed.2.fastq",
            name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq",
            name + ".trimmed.single.12.fastq"
        ],
                             args.output,
                             subfolder="kneaddata",
                             create_folder=True) for name in sample_names
    ]
    paired = True
    for target_set, input_R1, input_R2, name in zip(qc_targets, input_pair1,
                                                    input_pair2, sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]",
            depends=[input_R1, input_R2,
                     TrackedExecutable("kneaddata")],
            targets=[target_set[0], target_set[1], target_set[4]],
            args=[
                os.path.dirname(target_set[0]), args.threads, name,
                target_set[2], target_set[3]
            ])
else:
    qc_targets = utilities.name_files(sample_names,
                                      args.output,
                                      subfolder="kneaddata",
                                      create_folder=True,
                                      extension="trimmed.fastq")
    for target_file, input_file, name in zip(qc_targets, input_files,
                                             sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]]",
# Example 21
# Assumed setup (not shown in this snippet): create the workflow and declare the
# custom arguments that the tasks below rely on
from anadama2 import Workflow
from anadama2.tracked import TrackedExecutable

workflow = Workflow(version="0.1", description="AnADAMA2 example workflow")
workflow.add_argument("lines", desc="the number of lines to trim", default=10)
workflow.add_argument("metadata", desc="the metadata file for the R analysis task")

# Parsing the workflow arguments
args = workflow.parse_args()

#Loading the config setting
args.config = 'etc/config.ini'

# AnADAMA2 example workflow.do
workflow.do("ls /usr/bin/ | sort > [t:output/global_exe.txt]")  #Command
workflow.do("ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]")  #Command

# Task0 sample python analysis module  - src/trim.py
workflow.add_task(
    "src/trim.py --lines [args[0]] --output [targets[0]] --input " +
    args.input,  #Command 
    depends=[TrackedExecutable("src/trim.py")
             ],  #Tracking executable dependencies
    targets=args.output,  #Output target directory
    args=[args.lines])  #Additional arguments

# Task1 sample python visualization module - src/plot.py
workflow.add_task(
    "src/plot.py --output [targets[0]] --input " + args.input,  #Command 
    depends=[TrackedExecutable("src/plot.py")
             ],  #Tracking executable dependencies
    targets=args.output)  #Output target directory

# Task2 sample R module - src/analysis.R
workflow.add_task(
    "src/analysis.R -o [targets[0]] -d " + args.metadata,  #Command
    depends=[TrackedExecutable("src/analysis.R")
             ],  #Tracking executable dependencies
    targets=args.output)  #Output target directory

# Run the workflow tasks
workflow.go()
# Example 22
def remove_primers(workflow,fwd_primer,rev_primer,input_folder,output_folder,pair_id,threads):
    """ Identifies primers and N filters samples
       Args:
           workflow (anadama2.workflow): an instance of the workflow class
           input_folder (string): path to input folder
           output_folder (string):  path to output folder
           fwd_primer (string): forward primer
           rev_primer (string): reverse primer
           pair_id (string): pair identifier
           threads (string): number of threads

       Requires:
          dada2, Biostrings, ShortRead, tools R packages; cutadapt

       Returns:
           string: path to folder with primers removed files
    """
    script_path = utilities.get_package_file("identify_primers", "Rscript")
    filtN_folder = os.path.join(output_folder,"filtN")
    primers_folder = os.path.join(output_folder,"primers")
    fwd_primer_file = os.path.join(primers_folder,"fwd_primer_file.txt")
    rev_primer_file = os.path.join(primers_folder,"rev_primer_file.txt")
    cutadapt_folder = os.path.join(output_folder, "cutadapt")

    # run identify primers task
    workflow.add_task(
        "[vars[0]]  \
          --input_dir=[args[3]] \
          --filtn_dir=[vars[1]] \
          --primers_dir=[vars[2]] \
          --threads=[args[4]] \
          --fwd_primer_file=[targets[0]] \
          --rev_primer_file=[targets[1]] \
          --fwd_primer=[args[0]] \
          --rev_primer=[args[1]] \
          --pair_id=[args[2]]",
        targets=[fwd_primer_file,rev_primer_file,
                 TrackedDirectory(filtN_folder)],
        args=[fwd_primer, rev_primer, pair_id,input_folder,threads],
        vars=[script_path,filtN_folder,primers_folder,output_folder],
        name="identify_primers"
    )

    pair_id2 = pair_id.replace("1", "2",1)
    fwd_files = sorted(fnmatch.filter(os.listdir(input_folder), "*"+pair_id+"*.fastq*"))
    rev_files = sorted(fnmatch.filter(os.listdir(input_folder), "*" + pair_id2 + "*.fastq*"))

    #run cutadapt to remove primers
    for i in range(0,len(fwd_files)):
        fwd_file=os.path.join(input_folder,fwd_files[i])
        rev_file = os.path.join(input_folder, rev_files[i])
        workflow.add_task(
            cutadapt_do,
            depends=[fwd_primer_file,
                     rev_primer_file,
                     fwd_file,
                     rev_file,
                     TrackedDirectory(filtN_folder),
                     TrackedExecutable("cutadapt",version_command="echo 'cutadapt' `cutadapt --version`")],
            targets=[TrackedDirectory(cutadapt_folder)],
            name="remove_primers"
        )

    return cutadapt_folder
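# A hedged usage sketch for remove_primers (not from the original source): the
# primer sequences, folders, and pair identifier below are hypothetical placeholders.
from anadama2 import Workflow

workflow = Workflow(version="0.1", description="Identify and remove primers with cutadapt")
args = workflow.parse_args()

cutadapt_folder = remove_primers(
    workflow,
    fwd_primer="GTGYCAGCMGCCGCGGTAA",   # hypothetical forward primer
    rev_primer="GGACTACNVGGGTWTCTAAT",  # hypothetical reverse primer
    input_folder=args.input,
    output_folder=args.output,
    pair_id="_R1",
    threads=4)

# run the workflow
workflow.go()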