Example #1
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.
    
    Args: 
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline.
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1',
                        description='A workflow to handle HMP2 '
                        'Proteomics data.',
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('checksums-file',
                          desc='MD5 checksums for files '
                          'found in the supplied input directory.')
    workflow.add_argument(
        'data_specific_metadata',
        desc='A collection of '
        'dataset specific metadata that should be integrated '
        'with any analysis output (creating a PCL file).')

    return workflow
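A minimal driver sketch of how a parser like this is typically wired into a pipeline, following the parse_args()/go() pattern used in later examples on this page; the attribute names (args.manifest_file, args.config_file) are assumed from the hyphenated option names above, and the task-adding step is only indicated:

def main():
    # build the workflow and read the command-line options
    workflow = parse_cli_arguments()
    args = workflow.parse_args()

    # tasks that consume args.manifest_file, args.config_file, etc. would be added here

    # run all queued tasks
    workflow.go()

if __name__ == '__main__':
    main()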
Example #2
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1', description='A workflow to handle '
                        'uploading metadata and data files to the Data '
                        'Coordination Center (DCC)', 
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file', desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('metadata-file', desc='Accompanying metadata file '
                           'for the provided data files.', default=None)
    workflow.add_argument('baseline-metadata-file', desc='Metadata file '
                          'containing baseline visit metadata per subject.')                           
    workflow.add_argument('config-file', desc='Configuration file '
                          'containing parameters required by the workflow.')

    return workflow
Example #3
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline.
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1',
                        description='A workflow to handle HMP2 '
                        'host exome data.',
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('threads',
                          desc='Number of threads to use in '
                          'workflow processing',
                          default=1)

    return workflow
Example #4
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='1.0',
                        description='A workflow to handle visualization '
                        'of HMP2 16S data.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file',
                          desc='Accompanying metadata file '
                          'for the provided data files.',
                          default=None)
    workflow.add_argument('source',
                          desc='The source of the output files generated. '
                          '[biobakery, CMMR]',
                          default='biobakery',
                          choices=['biobakery', 'CMMR'])

    return workflow
Example #5
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='1.0',
                        description='A workflow to handle visualization '
                        'of HMP2 Metaviromics data.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file',
                          desc='Accompanying metadata file '
                          'for the provided data files.',
                          default=None)

    return workflow
Example #6
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1', description='A workflow to handle '
                        'analysis of 16S amplicon data.', 
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file', desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file', desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file', desc='Accompanying metadata file '
                           'for the provided data files.', default=None)
    workflow.add_argument('threads', desc='number of threads/cores for each '
                          'task to use', default=1)

    return workflow
Example #7
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.
    
    Args: 
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline.
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1', description='A workflow to handle HMP2 '
                        'Metabolomic data.', remove_options=['input', 'output'])
    workflow.add_argument('manifest-file', desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file', desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file', desc='Accompanying metadata file '
                           'for the provided data files.', default=None)
    workflow.add_argument('aux_metadata', desc='Any additional metadata '
                          'files that can supply metadata for our output '
                          'PCL files.')

    return workflow
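Example #8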
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='0.1', description='A workflow to assemble '
                        'metagenomic data and run a gene caller on the '
                        'resulting contigs.')
    workflow.add_argument('contaminant-db', desc='KneadData DNA contaminants database.')
    workflow.add_argument('file-extension', desc='Extension of input files to '
                          'assemble and gene call on.', default='.fastq.gz')
    workflow.add_argument('threads', desc='number of threads/cores for each '
                          'task to use', default=1)
    workflow.add_argument('memory', desc='The amount of memory to use for each '
                          'assembly job. Provided in GB', default='10240')


    return workflow
Example #9
    if new_sum.lower() == md5sum.lower():
        file_handle = open(task.targets[0].name, "w")
        file_handle.write("Match")
        file_handle.close()
    else:
        error_msg = "ERROR: Sums do not match for file {0}\nComputed Sum: {1}\nExpected Sum: {2}".format(
            task.depends[1].name, new_sum, md5sum)
        sys.stderr.write(error_msg)
        raise Exception(error_msg)


# create a workflow and get the arguments
workflow = Workflow()
workflow.add_argument("input-metadata",
                      desc="the metadata file",
                      required=True)
workflow.add_argument("input-extension",
                      desc="the input file extension",
                      required=True)
args = workflow.parse_args()

# get all of the input files
input_files = utilities.find_files(args.input,
                                   extension=args.input_extension,
                                   exit_if_not_found=True)
sample_names = utilities.sample_names(input_files, args.input_extension)

# for each raw input file, generate an md5sum file
md5sum_outputs = [
    os.path.join(args.output, output_file_name) + ".md5sum"
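Example #10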
from anadama2 import Workflow

from biobakery_workflows.tasks import dadatwo

workflow = Workflow()
workflow.add_argument("fwd-primer", desc="forward primer, required for its workflow",required=True)
workflow.add_argument("rev-primer", desc="reverse primer, required for its workflow",required=True)
workflow.add_argument("pair-identifier", desc="the string to identify the first file in a pair", default="_R1_001")
workflow.add_argument("threads", desc="number of threads/cores for each task to use", default=1)
args = workflow.parse_args()

dadatwo.remove_primers(workflow,args.fwd_primer,args.rev_primer,args.input,args.output,args.pair_identifier,args.threads)

workflow.go()
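Example #11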
# import the library of biobakery_workflow tasks for shotgun sequences
from biobakery_workflows.tasks import shotgun, general

# import the utilities functions and config settings from biobakery_workflows
from biobakery_workflows import utilities, config

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1",
                    description="A workflow for isolate assembly")

# add the custom arguments to the workflow
workflow_config = config.ShotGun()
workflow.add_argument("species-name", desc="the species name", required=True)
workflow.add_argument("input-extension",
                      desc="the input file extension",
                      default="fastq.gz",
                      choices=["fastq.gz", "fastq", "fq.gz", "fq"])
workflow.add_argument("threads",
                      desc="number of threads/cores for each task to use",
                      default=1)
workflow.add_argument("pair-identifier",
                      desc="the string to identify the first file in a pair",
                      default="_R1_001")
workflow.add_argument(
    "reference-database",
    desc="the path to the reference database for quality assessment",
    default="")
workflow.add_argument("dbcan-path",
Example #12
# import the task to convert from biom to tsv
from biobakery_workflows.tasks.sixteen_s import convert_from_biom_to_tsv_list

# import the files for descriptions and paths
from biobakery_workflows import files

# create a workflow instance, providing the version number and description
workflow = Workflow(
    version="0.1",
    remove_options=["input"],
    description="A workflow for stats on wmgx and 16s data sets")

# add the custom arguments to the workflow
workflow.add_argument(
    "input",
    desc="the folder containing taxonomy and functional data files",
    required=True)

# add the custom arguments to the workflow
workflow.add_argument("project-name",
                      desc="the name of the project",
                      required=True)
workflow.add_argument("input-metadata",
                      desc="the metadata file (samples as columns or rows)",
                      required=True)
workflow.add_argument(
    "transform",
    desc=
    "the transform to apply to the data with MaAsLin2 (default is the MaAsLin2 default transform)",
    default="")
workflow.add_argument(
Example #13
def parse_cli_arguments():
    '''
    Parses any command-line arguments passed into the workflow.

    Creates a workflow instance, providing the version number and description:
    the version number will appear when running this script with the "--version" option,
    and the description will appear when running this script with the "--help" option.
    '''

    tmp_output = os.path.abspath(config.working_dir)

    workflow = Workflow(
        version=VERSION,
        description="A workflow for MetaWIBELE characterization",
        remove_options=["output"])

    # add the custom arguments to the workflow
    workflow.add_argument("threads",
                          desc="number of threads/cores for each task to use",
                          default=None)
    workflow.add_argument(
        "characterization-config",
        desc="the configuration file of characterization analysis",
        default=None)
    workflow.add_argument(
        "mspminer-config",
        desc=
        "the configuration file used by mspminer; [no] skip MSPminer-based taxonomic assignment",
        default=None)
    workflow.add_argument("bypass-clustering",
                          desc="do not cluster proteins into protein families",
                          action="store_true")
    workflow.add_argument(
        "bypass-global-homology",
        desc=
        "do not annotate protein families based on global homology information",
        action="store_true")
    workflow.add_argument(
        "bypass-domain-motif",
        desc=
        "do not annotate protein families based on domain/motif information",
        action="store_true")
    workflow.add_argument(
        "bypass-interproscan",
        desc="do not annotate protein families based on interproscan",
        action="store_true")
    workflow.add_argument(
        "bypass-pfam_to_go",
        desc="do not annotate protein families based on pfam2go",
        action="store_true")
    workflow.add_argument(
        "bypass-domine",
        desc="do not annotate protein families based on DOMINE database",
        action="store_true")
    workflow.add_argument(
        "bypass-sifts",
        desc="do not annotate protein families based on SIFTS database",
        action="store_true")
    workflow.add_argument(
        "bypass-expatlas",
        desc=
        "do not annotate protein families based on Expression Atlas database",
        action="store_true")
    workflow.add_argument(
        "bypass-psortb",
        desc="do not annotate protein families based on psortb",
        action="store_true")
    workflow.add_argument(
        "bypass-abundance",
        desc="do not annotate protein families based on abundance information",
        action="store_true")
    workflow.add_argument(
        "bypass-mspminer",
        desc="do not annotate protein families based on MSPminer",
        action="store_true")
    workflow.add_argument(
        "bypass-maaslin",
        desc="do not annotate protein families based on MaAsLin2",
        action="store_true")
    workflow.add_argument(
        "split-number",
        desc=
        "the number of files to split the sequences into for sequence-based annotation",
        default=None)
    workflow.add_argument(
        "bypass-integration",
        desc="do not integrate annotations for protein families",
        action="store_true")
    workflow.add_argument("study", desc="specify the study name", default=None)
    workflow.add_argument("basename",
                          desc="specify the basename for output files",
                          default=None)
    workflow.add_argument(
        "input-sequence",
        desc=
        "input the sequence file for gene families (non-redundant catalogs)",
        required=True)
    workflow.add_argument(
        "input-count",
        desc="input the count file for gene families (non-redundant catalogs)",
        required=True)
    workflow.add_argument("input-metadata",
                          desc="input the metadata file",
                          required=True)
    workflow.add_argument(
        "output",
        desc=
        "provide an output folder to which the workflow database and log are written. By default, they will be written to the anadama2 folder of the user's working directory",
        default=tmp_output)

    return workflow
Example #14
from anadama2 import Workflow

# import the library of biobakery_workflow tasks for shotgun sequences
from biobakery_workflows.tasks import shotgun, general

# import the utilities functions and config settings from biobakery_workflows
from biobakery_workflows import utilities, config

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1", description="A workflow to run strainphlan")

# add the custom arguments to the workflow
workflow_config = config.ShotGun()
workflow.add_argument("input-extension", desc="the input file extension", default="fastq.gz", choices=["fastq.gz","fastq","fq.gz","fq","fasta","fasta.gz"])
workflow.add_argument("threads", desc="number of threads/cores for each task to use", default=1)
workflow.add_argument("bypass-taxonomic-profiling", desc="do not run the taxonomic profiling tasks (a tsv profile for each sequence file must be included in the input folder using the same sample name)", action="store_true")
workflow.add_argument("strain-profiling-options", desc="additional options when running the strain profiling step", default="")
workflow.add_argument("max-strains", desc="the max number of strains to profile", default=20, type=int)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True)

### STEP #1: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
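Example #15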
from biobakery_workflows import document_templates, utilities

# import the files for descriptions and paths
from biobakery_workflows import files
import os

# create a workflow instance, providing the version number and description
# remove the input folder option as it will be replaced with multiple input files
workflow = Workflow(version="0.1", remove_options=["input"],
                    description="A workflow for 16S visualization")
                    
# add the custom arguments to the workflow 
# create a custom description for the input argument listing all expected input files
input_desc="A folder containing the final products from the 16s data workflow.\n\nThe input folder must include the following:\n\n"  
                      
workflow.add_argument("input",desc=input_desc,required=True)

# add the custom arguments to the workflow
workflow.add_argument("project-name",desc="the name of the project",required=True)
workflow.add_argument("input-metadata",desc="the metadata file (samples as columns or rows)")
workflow.add_argument("input-picard",desc="the folder of picard quality score files")
workflow.add_argument("input-picard-extension",desc="the extensions for the picard quality score files", default="quality_by_cycle_metrics")
workflow.add_argument("metadata-categorical",desc="the categorical features", action="append", default=[])
workflow.add_argument("metadata-continuous",desc="the continuous features", action="append", default=[])
workflow.add_argument("metadata-exclude",desc="the features to exclude", action="append", default=[])
workflow.add_argument("exclude-workflow-info",desc="do not include data processing task info in report", action="store_true")
workflow.add_argument("format",desc="the format for the report", default="pdf", choices=["pdf","html"])

# get the arguments from the command line
args = workflow.parse_args()
Example #16
import os

from anadama2 import Workflow

# to run provide the new workflow run input and output folders
# $ python anadama2_add_files_to_database.py --input $NEW_INPUT_FOLDER --output $NEW_OUTPUT_FOLDER

workflow = Workflow()

# add the list of possible file extensions
workflow.add_argument(
    "input-extensions",
    desc="the comma-delimited list of extensions of the input files",
    default="txt,tsv,fastq,fastq.gz,log,sam")
args = workflow.parse_args()


# get all of the files in the input folder with the extensions provided
def get_files_to_add(input_folder):
    possible_extensions = set(args.input_extensions.split(","))
    input_files = []
    for folder, directories, files in os.walk(input_folder):
        if not ".anadama" in folder:
            for filename in files:
                if any(
                        map(lambda ext: filename.endswith(ext),
                            possible_extensions)):
                    input_files.append(os.path.join(folder, filename))
    return input_files
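A short, hypothetical continuation showing how the helper above might then be used; args.input is the standard anadama2 --input option mentioned in the comment at the top of this example, and the database-registration tasks are only indicated:

# collect every matching file under the provided --input folder
files_to_add = get_files_to_add(args.input)

# tasks that register each file in files_to_add would be added here

# run the workflow
workflow.go()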

Example #17
#!/usr/bin/env python

from anadama2 import Workflow
import os

workflow = Workflow(version="0.0.2", description="A workflow to run PanPhlAn")

workflow.add_argument("threads",
                      default=1,
                      desc="number of threads for panphlan to use")
workflow.add_argument("dbfolder",
                      default=None,
                      desc="folder containing database")
workflow.add_argument("filesfile",
                      default=None,
                      desc="file with filepaths to run on (relative to input)")
workflow.add_argument("ref", default=None, desc="name of reference db")
workflow.add_argument(
    "refs",
    default=None,
    desc="file with list of references (relative to dbfolder)")

args = workflow.parse_args()

in_files = workflow.get_input_files(".fastq.gz")
out_files = workflow.name_output_files(name=in_files,
                                       tag="panphlan_map",
                                       extension="csv.bz2")

if args.filesfile:
    with open(args.filesfile) as f:
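Example #18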
    ["genefamilies_norm_ratio", "ecs_norm_ratio", "paths_norm_ratio"]
}

# create a custom description for the input argument listing all expected input files
input_desc = "A folder containing the final products from the wmgx_mwtx data workflow.\n\nThe input folder should include the following:\n\n"
input_desc += "Whole Metagenome Shotgun\n---------------------------------\n"
input_desc += files.ShotGun.list_file_path_description(
    files.ShotGun.wmgx_folder_name, wmgx_input_files)
input_desc += "\n\nWhole Metatranscriptome Shotgun\n---------------------------------\n"
input_desc += files.ShotGun.list_file_path_description(
    files.ShotGun.wmtx_folder_name, wmtx_input_files)
input_desc += "\n\nRNA/DNA Norm\n---------------------------------\n"
input_desc += files.ShotGun.list_file_path_description("", norm_input_files)

# add the custom arguments to the workflow
workflow.add_argument("input", desc=input_desc, required=True)
workflow.add_argument("project-name",
                      desc="the name of the project",
                      required=True)
workflow.add_argument(
    "introduction-text",
    desc="the text to include in the intro of the report",
    default=
    "The data was run through the standard workflow for whole metagenome and metatranscriptome shotgun sequencing."
)
workflow.add_argument(
    "exclude-workflow-info",
    desc="do not include data processing task info in report",
    action="store_true")
workflow.add_argument("format",
                      desc="the format for the report",
# The fasta reads will be any of the sample reads that map to a marker associated with one of the 
# species in the "--species-list" file. This file should have one species per line and be formatted 
# with the metaphlan2 species naming convention. More specifically, the species file should list
# one per line with metaphlan2 format (ie "s__Gemella_sanguinis") and for unknown species
# include the genus in this file (ie "s__Gemella_unclassified" should be included in the file as "g__Gemella").
# The metaphlan2 pkl database is also required for this script to run and can be provided 
# with the option "--pkl-database". 
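# A hypothetical species_list.txt illustrating the format described above
# (one MetaPhlAn2 clade per line, with a genus-level entry standing in for an unclassified species):
#
#   s__Gemella_sanguinis
#   g__Gemella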

SAM_READ_NAME_INDEX = 0
SAM_REFERENCE_NAME_INDEX = 2
SAM_SEQ_INDEX = 9

workflow = Workflow()

# input folder should have sam alignment files from metaphlan2 run
workflow.add_argument("pkl-database", desc="MetaPhlAn2 pkl database", default="metaphlan2_db/mpa_v20_m200.pkl")
workflow.add_argument("species-list", desc="the list of species to pull reads for", default="species_list.txt")
workflow.add_argument("input-tag-extension", desc="the file name tag and extension", default="_bowtie2.sam")
args = workflow.parse_args()

def find_reads(task):
    # read in the species
    with open(args.species_list) as file_handle:
        species_list = [taxon.rstrip() for taxon in file_handle.readlines()]

    db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

    marker_to_species={}
    for marker,info in db['markers'].items():
        if info['clade'] in species_list:
            marker_to_species[marker]=info['clade']
Example #20
import datetime

# constants
ARCHIVE_FOLDER = "/opt/archive_folder/"
COUNT_FILE = os.path.join(ARCHIVE_FOLDER, "data_deposition_counts.csv")
PUBLIC_COUNT_FILE = os.path.join(ARCHIVE_FOLDER,
                                 "data_deposition_counts_public.csv")

# create a workflow to check the md5sums for each file
from anadama2 import Workflow
from biobakery_workflows import utilities

# create a workflow and get the arguments
workflow = Workflow(remove_options=["input"])
workflow.add_argument("input-upload",
                      desc="the folder of raw uploaded data",
                      required=True)
workflow.add_argument("input-processed",
                      desc="the folder of processed data",
                      required=True)
workflow.add_argument("key",
                      desc="the key file to use for the transfer",
                      required=True)
workflow.add_argument("user",
                      desc="the user id for the transfer",
                      required=True)
workflow.add_argument("remote",
                      desc="the remote host name for the transfer",
                      required=True)
workflow.add_argument("study", desc="the name of the study", required=True)
workflow.add_argument("output-transfer",
Example #21
#!/usr/bin/env python

from anadama2 import Workflow
import os

workflow = Workflow(version="0.0.1", description="A workflow to run PanPhlAn")

workflow.add_argument("dbfolder",
                      default=None,
                      desc="folder containing database")
workflow.add_argument("ref", default=None, desc="name of reference db")
workflow.add_argument(
    "refs",
    default=None,
    desc="file with list of references (relative to dbfolder)")

args = workflow.parse_args()

cmd = "panphlan_profile.py -c {0} -i {0}/ --o_dna [target] --add_strains"

if args.dbfolder:
    cmd += " --i_bowtie2_indexes {}".format(args.dbfolder)

if args.ref:
    refs = [args.ref]
elif args.refs:
    r = open(args.refs, "r")
    refs = [l.strip() for l in r]
    r.close()

for ref in refs:
Example #22
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.

    Args:
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline
        anadama2.cli.Configuration: Arguments passed into this workflow.
    """
    workflow = Workflow(version='1.0',
                        description='A workflow to handle HMP2 '
                        'WGS data.',
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file',
                          desc='Accompanying metadata file '
                          'for the provided data files.',
                          default=None)
    workflow.add_argument('threads',
                          desc='number of threads/cores for each '
                          'task to use',
                          default=1)
    workflow.add_argument('threads-kneaddata',
                          desc='OPTIONAL. A specific '
                          'number of threads/cores to use just for the '
                          'kneaddata task.',
                          default=None)
    workflow.add_argument('threads-metaphlan',
                          desc='OPTIONAL. A specific '
                          'number of threads/cores to use just for the '
                          'metaphlan2 task.',
                          default=None)
    workflow.add_argument(
        'threads-humann',
        desc='OPTIONAL. A specific '
        'number of threads/cores to use just for the humann2 '
        'task.',
        default=None)
    return workflow
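For illustration only, a hedged sketch (not taken from the original pipeline) of how these optional per-tool thread counts are typically resolved against the global --threads value once the arguments are parsed:

workflow = parse_cli_arguments()
args = workflow.parse_args()

# fall back to the global thread count when no tool-specific override is given
threads_kneaddata = args.threads_kneaddata or args.threads
threads_metaphlan = args.threads_metaphlan or args.threads
threads_humann = args.threads_humann or args.threads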
Example #23
"""

from anadama2 import Workflow
import os, sys, fnmatch

from biobakery_workflows.tasks import sixteen_s, dadatwo, general
from biobakery_workflows import utilities, config, files

# create a workflow instance, providing the version number and description
workflow = Workflow(version="0.1",
                    description="A workflow for 16S sequencing data")

# add the custom arguments to the workflow
workflow_config = config.SixteenS()
workflow.add_argument("method",
                      desc="method to process 16s workflow",
                      default="vsearch",
                      choices=["usearch", "dada2", "vsearch", "its"])
workflow.add_argument("dada-db",
                      desc="reference database for dada2 workflow",
                      default="silva",
                      choices=["gg", "rdp", "silva", "unite"])
workflow.add_argument(
    "usearch-db",
    desc=
    "full paths for the reference databases (fna and taxonomy, comma delimited) for the usearch workflow",
    default=",".join([
        workflow_config.greengenes_fasta, workflow_config.greengenes_taxonomy
    ]))
workflow.add_argument("bypass-functional-profiling",
                      desc="bypass the functional profiling tasks",
                      action="store_true")
import os
from glob import glob
from anadama2 import Workflow
from anadama2.tracked import TrackedExecutable

# Setting the version of the workflow and short description
workflow = Workflow(
    version="0.0.1",  #Update the version as needed
    description="Analysis Template"  #Update the description as needed
)

# Setting additional custom arguments for workflow - run.py
workflow.add_argument(name="lines",
                      desc="Number of lines to trim [default: 10]",
                      default="10")

workflow.add_argument(
    name="metadata",
    desc="Metadata for performing analysis [default: input/metadata.tsv]",
    default="input/metadata.tsv")

# Parsing the workflow arguments
args = workflow.parse_args()

#Loading the config setting
args.config = 'etc/config.ini'

# AnADAMA2 example workflow.do
workflow.do("ls /usr/bin/ | sort > [t:output/global_exe.txt]")  #Command
workflow.do("ls $HOME/.local/bin/ | sort > [t:output/local_exe.txt]")  #Command
Example #25
# import the utilities functions and config settings from biobakery_workflows
from biobakery_workflows import utilities, config

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(
    version="0.1",
    description="A workflow for whole metagenome shotgun sequences")

# add the custom arguments to the workflow
workflow_config = config.ShotGun()
workflow.add_argument("input-extension",
                      desc="the input file extension",
                      default="fastq.gz",
                      choices=[
                          "fastq.gz", "fastq", "fq.gz", "fq", "fasta",
                          "fasta.gz", "fastq.bz2", "fq.bz2"
                      ])
workflow.add_argument("barcode-file", desc="the barcode file", default="")
workflow.add_argument("dual-barcode-file",
                      desc="the string to identify the dual barcode file",
                      default="")
workflow.add_argument("index-identifier",
                      desc="the string to identify the index files",
                      default="_I1_001")
workflow.add_argument(
    "min-pred-qc-score",
    desc="the min phred quality score to use for demultiplexing",
    default=2)
workflow.add_argument("threads",
# -*- coding: utf-8 -*-
from anadama2 import Workflow

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1", description="A workflow to run KneadData")

# add the custom arguments to the workflow
workflow.add_argument("kneaddata-db", desc="the kneaddata database", default="/work/code/kneaddata/db/")
workflow.add_argument("input-extension", desc="the input file extension", default="fastq")
workflow.add_argument("threads", desc="number of threads for knead_data to use", default=1)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
in_files = workflow.get_input_files(extension=args.input_extension)

# get a list of output files, one for each input file, with the kneaddata tag
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")

# create a task for each set of input and output files to run kneaddata
workflow.add_task_group(
    "kneaddata --input [depends[0]] --output [output_folder] --reference-db [kneaddata_db] --threads [threads]",
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads)
Example #27
# species in the "--species-list" file. This file should have one species per line and be formatted
# with the metaphlan species naming convention. More specifically, the species file should list
# one per line with metaphlan format (ie "s__Gemella_sanguinis") and for unknown species
# include the genus in this file (ie "s__Gemella_unclassified" should be included in the file as "g__Gemella").
# The metaphlan pkl database is also required for this script to run and can be provided
# with the option "--pkl-database".

SAM_READ_NAME_INDEX = 0
SAM_REFERENCE_NAME_INDEX = 2
SAM_SEQ_INDEX = 9

workflow = Workflow()

# input folder should have sam alignment files from metaphlan run
workflow.add_argument("pkl-database",
                      desc="MetaPhlAn pkl database",
                      default="metaphlan_db/mpa_v30_CHOCOPhlAn_201901.pkl")
workflow.add_argument("species-list",
                      desc="the list of species to pull reads for",
                      default="species_list.txt")
workflow.add_argument("input-tag-extension",
                      desc="the file name tag and extension",
                      default="_bowtie2.sam")
args = workflow.parse_args()


def find_reads(task):
    # read in the species
    with open(args.species_list) as file_handle:
        species_list = [taxon.rstrip() for taxon in file_handle.readlines()]
Example #28
def parse_cli_arguments():
	'''
	Parses any command-line arguments passed into the workflow.

	Creates a workflow instance, providing the version number and description:
	the version number will appear when running this script with the "--version" option,
	and the description will appear when running this script with the "--help" option.
	'''

	tmp_output = os.path.abspath(config.working_dir)

	workflow = Workflow(version = VERSION, description = "A workflow for MetaWIBELE prioritization", remove_options=["output"])

	# add the custom arguments to the workflow
	workflow.add_argument("threads",
						desc = "number of threads/cores for each task to use",
						default = None)
	workflow.add_argument("prioritization-config",
	                    desc = "the configuration file for prioritization",
	                    default = None)
	workflow.add_argument("vignette-config",
	                    desc = "the file with specific functions of interest used as binary filtering for prioritization",
	                    default = "none")
	workflow.add_argument("bypass-mandatory",
	                     desc = "do not prioritize protein families based on quantitative criteria (mandatory prioritization)",
						 action = "store_true")
	workflow.add_argument("bypass-optional",
	                     desc = "do not prioritize protein families based on selecting our for interested annotations (optional prioritization)",
	                     action = "store_true")
	workflow.add_argument("bypass-finalized",
	                     desc = "do not finalize prioritized protein families",
						 action = "store_true")
	workflow.add_argument("selected-output",
	                    desc = "the output file name for the prioritized protein families by binary filtering",
	                    default = None)
	workflow.add_argument("basename",
						desc="specify the basename for output files",
						default = None)
	workflow.add_argument("input-annotation",
	                    desc = "provide the annotation file for protein families",
	                    required = True)
	workflow.add_argument("input-attribute",
	                    desc = "provide the annotation attribute file for protein families",
						required = True)
	workflow.add_argument("output",
	                    desc = "provide an output folder which the workflow database and log is written. By default, thet be written to the anadama2 folder of users' workding directory",
	                    default = tmp_output)

	return workflow
Example #29
def parse_cli_arguments():
    """Parses any command-line arguments passed into the workflow.
    
    Args: 
        None
    Requires:
        None
    Returns:
        anadama2.Workflow: The workflow object for this pipeline.
    """
    workflow = Workflow(version='0.1',
                        description='A workflow to handle '
                        'refreshing and disseminating HMP2 metadata.',
                        remove_options=['input', 'output'])
    workflow.add_argument('manifest-file',
                          desc='Manifest file containing '
                          'files to process in this workflow run.')
    workflow.add_argument('config-file',
                          desc='Configuration file '
                          'containing parameters required by the workflow.')
    workflow.add_argument('metadata-file',
                          desc='If an existing metadata '
                          'file exists it can be supplied here. This metadata '
                          'file will be appended to instead of a whole new '
                          'metadata file being generated.')
    workflow.add_argument('studytrax-metadata-file',
                          desc='Accompanying '
                          'StudyTrax data for all corresponding samples in the '
                          'HMP2 project.')
    workflow.add_argument('broad-sample-tracking-file',
                          desc='Broad Institute '
                          'sample tracking spreadsheet containing status of '
                          'sequence products generated.')
    workflow.add_argument('proteomics-metadata',
                          desc='PNNL-supplied metadata '
                          'spreadsheet.')
    workflow.add_argument('auxillary-metadata',
                          action='append',
                          default=[],
                          desc='Any auxiliary metadata to be appended '
                          'to the final metadata table.')

    return workflow
Example #30
def parse_cli_arguments():
    '''
    Parses any command-line arguments passed into the workflow.

    Creates a workflow instance, providing the version number and description:
    the version number will appear when running this script with the "--version" option,
    and the description will appear when running this script with the "--help" option.
    '''

    workflow = Workflow(
        version=VERSION,
        description=
        "A workflow to preprocess shotgun sequencing reads of metagenomes "
        "with tasks of metagenomic assembly, gene calling, "
        "building gene catalogs and generating gene abundance for each sample."
    )

    # add the custom arguments to the workflow
    workflow.add_argument("threads",
                          desc="number of threads/cores for each task to use",
                          default=None)
    workflow.add_argument(
        "extension-paired",
        desc=
        "provide the extension for paired fastq files using comma to separate, e.g. .R1.fastq.gz,.R2.fastq.gz | .R1.fastq,.R2.fastq",
        default=None)
    workflow.add_argument("extension",
                          desc="provide the extension for all fastq files",
                          choices=[".fastq.gz", ".fastq"],
                          default=".fastq.gz")
    workflow.add_argument("gene-call-type",
                          desc="specify which type of gene calls will be used",
                          choices=['prokka', 'prodigal', 'both'],
                          default='prodigal')
    workflow.add_argument("bypass-assembly",
                          desc="do not run assembly",
                          action="store_true")
    workflow.add_argument("bypass-gene-calling",
                          desc="do not call ORFs",
                          action="store_true")
    workflow.add_argument("bypass-gene-catalog",
                          desc="do not build gene catalogs",
                          action="store_true")
    workflow.add_argument("output-basename",
                          desc="provide the basename for output files",
                          default=None)

    return workflow
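A brief sketch (hypothetical driver code, with the attribute name args.extension_paired assumed from the hyphenated option above) of how the comma-separated paired-end extensions would typically be unpacked after parsing:

workflow = parse_cli_arguments()
args = workflow.parse_args()

# split the paired-end option into forward and reverse read extensions
if args.extension_paired:
    forward_extension, reverse_extension = args.extension_paired.split(",")
else:
    forward_extension = reverse_extension = args.extension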