def main():
    """Parse the Ruffus command line and start the pipeline.

    If the parsed arguments carry any target tasks, the run is delegated to
    ``cmdline.run``. Otherwise ``pipeline_run`` is invoked with
    ``publish_data``.
    """
    options = cmdline.get_argparse(
        description="Trench Run pipeline").parse_args()

    # No explicit targets on the command line: run the publish_data target.
    if not options.target_tasks:
        pipeline_run(publish_data)
        return

    cmdline.run(options)
def configure(config, args):
    """
    Build the runtime configuration and parse the command line

    Parameters
    ----------
    config: dict or Namespace
        Settings used to construct the ApusConfig object. A dict is expanded
        into keyword arguments; anything else is passed as ``config=``
    args: list
        Argument strings handed to the parser created by ruffus.cmdline

    Returns
    -------
    apusconf: ApusConfig
        The configuration object, with ``logger`` and ``logger_mutex``
        attached
    option: Namespace
        The parsed command line arguments
    """
    apusconf = (ApusConfig(**config) if isinstance(config, dict)
                else ApusConfig(config=config))

    # log and history files default to paths inside the configured logdir
    default_log_file = os.path.join(apusconf.logdir, apusconf.log_file)
    default_history_file = os.path.join(apusconf.logdir,
                                        apusconf.history_file)

    parser = cmdline.get_argparse(
        description=""" +- Astronomy Pipeline Using ruffuS, specifically tweaked for PostCalib -+ """,
        version=ruffus.__version__,
        prog='postcalib run ... -a ')
    parser.add_argument(
        '-r', '--redo-all', action='store_true',
        help='force redo all tasks')
    parser.add_argument(
        '-l', '--list-tasks', action='store_true',
        help='list the task names and exit')
    parser.set_defaults(
        verbose=['0', ],
        log_file=default_log_file,
        history_file=default_history_file)
    option = parser.parse_args(args)

    # create the shared logger and hang it (and its mutex) on the config
    apusconf.logger, apusconf.logger_mutex = make_shared_logger_and_proxy(
        logger_factory,
        apusconf.jobkey,
        [option.log_file, option.verbose])
    return apusconf, option
def parse_command_line():
    '''Create the pipeline's argument parser and return the parsed arguments.

    ``version`` is listed in ``ignored_args`` and a ``--version`` flag is
    registered here instead, built from the module-level ``version`` string.
    '''
    config_help = 'Pipeline configuration file in YAML format, defaults to {}'.format(
        DEFAULT_CONFIG_FILE)
    jobscripts_help = ('Directory to store cluster job scripts created by the '
                       'pipeline, defaults to {}').format(DEFAULT_JOBSCRIPT_DIR)

    parser = cmdline.get_argparse(
        description='Variant calling pipeline',
        ignored_args=["version"],
    )
    parser.add_argument(
        '--config', type=str, default=DEFAULT_CONFIG_FILE, help=config_help)
    parser.add_argument(
        '--jobscripts', type=str, default=DEFAULT_JOBSCRIPT_DIR,
        help=jobscripts_help)

    version_banner = '%(prog)s ' + version
    parser.add_argument('--version', action='version', version=version_banner)

    arguments = parser.parse_args()
    return arguments
def parse_command_line(version):
    '''Create the pipeline's argument parser and return the parsed arguments.

    version -- string appended to the program name for the ``--version`` flag
    '''
    parser = cmdline.get_argparse(
        description='RAD-Seq pipeline', ignored_args=["version"])

    # (flag, default value, help template); registered in this order
    path_options = (
        ('--config', DEFAULT_CONFIG_FILE,
         'Pipeline configuration file in YAML format, defaults to {}'),
        ('--jobscripts', DEFAULT_JOBSCRIPT_DIR,
         'Directory to store cluster job scripts created by the '
         'pipeline, defaults to {}'),
    )
    for flag, default_value, help_template in path_options:
        parser.add_argument(
            flag, type=str, default=default_value,
            help=help_template.format(default_value))

    parser.add_argument(
        '--version', action='version', version='%(prog)s ' + version)
    return parser.parse_args()
def parse_command_line():
    '''Create the pipeline's argument parser and return the parsed arguments.

    Besides the options added by ruffus.cmdline, the parser accepts
    ``--config``, ``--jobscripts``, ``--mode`` (either "map" or "process")
    and ``--version`` (built from the module-level ``version`` string).
    '''
    config_help = 'Pipeline configuration file in YAML format, defaults to {}'.format(
        DEFAULT_CONFIG_FILE)
    jobscripts_help = ('Directory to store cluster job scripts created by the '
                       'pipeline, defaults to {}').format(DEFAULT_JOBSCRIPT_DIR)
    mode_help = ('set to "map" to process fastq files up to haplotype caller'
                 ', or "process" to process multiple map directories. Default is map')

    parser = cmdline.get_argparse(
        description='A variant discovery pipeline',
        ignored_args=["version"],
    )
    parser.add_argument(
        '--config', type=str, default=DEFAULT_CONFIG_FILE, help=config_help)
    parser.add_argument(
        '--jobscripts', type=str, default=DEFAULT_JOBSCRIPT_DIR,
        help=jobscripts_help)
    parser.add_argument(
        '--mode', type=str, default='map', choices=['map', 'process'],
        help=mode_help)

    version_banner = '%(prog)s ' + version
    parser.add_argument('--version', action='version', version=version_banner)
    return parser.parse_args()
def test_argparse(self):
    """
    Same as above but setting up options using ruffus.cmdline.get_argparse

        --verbose on its own increases the verbosity by one
        --verbose NNN (re)sets the verbosity to NNN whatever the previous state
        --verbose NNN:MMM sets the verbose_abbreviated_path to MMM

    The argument lists are handed directly to ``parse_args`` instead of being
    written into ``sys.argv``, so the process-wide argument vector is not
    left modified for whatever test runs next.
    """
    parser = cmdline.get_argparse(description='WHAT DOES THIS PIPELINE DO?')

    # (command line, expected verbose, expected verbose_abbreviated_path)
    cases = [
        (["--verbose", "--verbose=2"],
         2, None),
        (["--verbose", "--verbose=3", "--verbose"],
         4, None),
        (["--verbose", "--verbose=5:3", "--verbose"],
         6, 3),
        (["--verbose", "--verbose=5:3", "--verbose", "--verbose=7", "--verbose"],
         8, 3),
        (["--verbose", "--verbose=5:3", "--verbose", "--verbose=7:5", "--verbose"],
         8, 5),
    ]
    for argv, expected_verbose, expected_abbreviated_path in cases:
        options = parser.parse_args(argv)
        handle_verbose(options)
        # assertEqual reports both values on failure, unlike assertTrue(a == b)
        self.assertEqual(options.verbose, expected_verbose)
        self.assertEqual(options.verbose_abbreviated_path,
                         expected_abbreviated_path)
) sys.exit(ExitCode.missing_dependency) check_pil_encoder('jpg', 'JPEG') check_pil_encoder('zlib', 'PNG') # ------------- # Parser parser = cmdline.get_argparse( prog="ocrmypdf", description="Generate searchable PDF file from an image-only PDF file.", version=VERSION, fromfile_prefix_chars='@', ignored_args=[ 'touch_files_only', 'recreate_database', 'checksum_file_name', 'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format', 'forced_tasks', 'target_tasks', 'use_threads', 'jobs', 'log_file']) parser.add_argument( 'input_file', help="PDF file containing the images to be OCRed") parser.add_argument( 'output_file', help="output searchable PDF file") parser.add_argument( '-l', '--language', action='append', help="languages of the file to be OCRed") parser.add_argument(
"See installation instructions for your platform here:\n" " https://pillow.readthedocs.org/installation.html") sys.exit(ExitCode.missing_dependency) check_pil_encoder('jpg', 'JPEG') check_pil_encoder('zlib', 'PNG') # ------------- # Parser parser = cmdline.get_argparse( prog="ocrmypdf", description="Generate searchable PDF file from an image-only PDF file.", version=VERSION, fromfile_prefix_chars='@', ignored_args=[ 'touch_files_only', 'recreate_database', 'checksum_file_name', 'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format', 'forced_tasks', 'target_tasks', 'use_threads', 'jobs', 'log_file' ]) parser.add_argument('input_file', help="PDF file containing the images to be OCRed") parser.add_argument('output_file', help="output searchable PDF file") parser.add_argument('-l', '--language', action='append', help="languages of the file to be OCRed") parser.add_argument( '-j', '--jobs',
elif not params['alignments'].has_key('genome_index') or params['alignments']['genome_index'] is None or\ type(params['alignments']['genome_index']) is not list or len(params['alignments']['genome_index']) != 2: error = 'no proper GMAP index provided' elif not params['annotations'].has_key('genome_fasta') or params['annotations']['genome_fasta'] is None: error = 'no genome fasta provided' elif not params['annotations'].has_key('gtf') or params['annotations']['gtf'] is None: error = 'no gtf provided' if not args.only_splicing: if not params['alignments'].has_key('transcripts_fasta') or params['alignments']['transcripts_fasta'] is None: error = 'no transcripts fasta provided' if error is not None: sys.exit('TAP ABORTED: %s' % error.upper()) parser = cmdline.get_argparse(description='TAP pipeline') parser.add_argument('sample', type=str, help='sample name') parser.add_argument('outdir', type=str, help='output directory') parser.add_argument('--bf', type=str, help='path to bloomfilter') parser.add_argument('--fq', type=str, nargs='+', help='input gzipped fastqs') parser.add_argument('--fq_list', type=str, help='text file of input fastq paths') parser.add_argument('--nprocs', type=int, default=32, help='number of threads/processes. Default=32') parser.add_argument('--remove_fq', action='store_true', help='remove intermediate fastqs') parser.add_argument('--only_assembly', action='store_true') parser.add_argument('--only_sv', action='store_true') parser.add_argument('--only_splicing', action='store_true') parser.add_argument('--genome_bam', type=str, help='genome bam(for detecting splice-site variants)') parser.add_argument('--params', type=str, help='parameters file') assembly = parser.add_argument_group('assembly') assembly.add_argument('--k', type=int, nargs='+', help='k sizes for assembly') assembly.add_argument('--readlen', type=int, help='read length')
#!/usr/bin/env python from glob import glob import glob import sys, os, fnmatch import re from ruffus import * import ruffus.cmdline as cmdline from subprocess import check_call from ruffus.drmaa_wrapper import run_job, error_drmaa_job import drmaa ### parse command line for arguments parser = cmdline.get_argparse(description="Chela's Pipeline") parser.add_argument('-i', '--input', metavar="FILE", help = "Fastq files") parser.add_argument('--cuffdiff_file', metavar="FILE", help = "cuffdiff comparison instructions") parser.add_argument('--basedir', metavar="DIR", help = "base directory") parser.add_argument('--aligner', metavar="choice", help = "choice of aligner; enter hisat or star") parser.add_argument('--gtf', metavar="choice", help = "choice of gtf; enter all_transcripts, all_coding or ercc") #parser.add_argument('--flat_gff', metavar="choice", help = "flat gff for DEXseq") parser.add_argument('--kallisto', metavar="choice", help = "use kallisto?") parser.add_argument('--species', metavar="choice", help = "species" ) parser.add_argument('--stringtie', metavar="choice", help = "use stringtie?") options = parser.parse_args() cuffdiff_file = options.cuffdiff_file basedir=options.basedir aligner=options.aligner kallisto=options.kallisto stringtie=options.stringtie hisat_check=aligner=="hisat"
import sys import os.path import logging import os from Bio import SeqIO import re import argparse MODULES_KEY = 'MODULESHOME' if MODULES_KEY in os.environ: modules_init = os.path.join(os.environ[MODULES_KEY], 'init/python.py') execfile(modules_init) # need this for faToTwoBit module('load', 'blat/default') parser = cmdline.get_argparse( description='Use Ruffus to process .out files from genblastA') parser.add_argument('--working_directory', '-W', default='.') parser.add_argument('genome_filename', help='FASTA format genome file, must end in .fa or .fasta') parser.add_argument('hints_filename', help='Augustus hints filename, generate from exonerate') args = parser.parse_args() logger, logger_mutex = cmdline.setup_logging(__name__, args.log_file, args.verbose) FASTA_RE = r'\.(fa|fasta)$' os.chdir(args.working_directory) def safe_open(filename, mode='r'):
Ruffus pipeline for simple bowtie alignment """ from ruffus import * from big_wig_extras import BigWigExtras import ruffus.cmdline as cmdline import subprocess import logging import os import pprint import re import time parser = cmdline.get_argparse( description= 'Given a directory of sorted bam files, convert them to adjusted bigWigs') # Program arguments -- Most go straight to bowtie parser.add_argument( "--dir", help="Fullpath to the directory where the BAMS are located", required=True) parser.add_argument("--size", help="Fullpath to size file") #parser.add_argument("--reads", help="Fullpath to read stats file", required=True) parser.add_argument("--output", help="Fullpath to output dir", default="./") # parse the args options = parser.parse_args() # Kenny loggins
pipe.transform( name="convert_csv_files_to_tsv", task_func=csv_to_tsv, input=output_from("create_three_new_files"), filter=suffix(".csv"), output=".tsv", ) pipe.transform( name="calculate_md5", task_func=md5, input=output_from("convert_csv_files_to_tsv"), filter=suffix(".tsv"), output=".md5sum", ) return pipe if __name__ == "__main__": parser = cmdline.get_argparse(description="CNV Calling", ignored_args=["jobs"]) options = parser.parse_args() options.history_file = os.path.join(WORK_DIR, ".ruffus_history.sqlite") pipeline = build_pipeline() cmdline.run(options, multithead=3)
import sys import os.path import logging import os from Bio import SeqIO import re import argparse MODULES_KEY = 'MODULESHOME' if MODULES_KEY in os.environ: modules_init = os.path.join(os.environ[MODULES_KEY], 'init/python.py') execfile(modules_init) # need this for faToTwoBit module('load', 'blat/default') parser = cmdline.get_argparse(description='Use Ruffus to process .out files from genblastA') parser.add_argument('--working_directory', '-W', default='.') parser.add_argument('genome_filename', help='FASTA format genome file, must end in .fa or .fasta') parser.add_argument('hints_filename', help='Augustus hints filename, generate from exonerate') args = parser.parse_args() logger, logger_mutex = cmdline.setup_logging(__name__, args.log_file, args.verbose) FASTA_RE=r'\.(fa|fasta)$' os.chdir(args.working_directory) def safe_open(filename, mode='r'): try: file_obj = open(filename, mode) except IOError as e:
# ruffus imports from ruffus import * import ruffus.cmdline as cmdline import logging, time, os, subprocess from ebseq_extras import EbseqExtras # EMAIL import smtplib from email.MIMEMultipart import MIMEMultipart from email.MIMEBase import MIMEBase from email.MIMEText import MIMEText from email.Utils import COMMASPACE, formatdate from email import Encoders parser = cmdline.get_argparse( description= "This is a pipeline for RSEM alignment and subsequent DE gene analysis with EBseq" ) # Program arguments parser.add_argument( "--dir", help="Fullpath to the directory where the FASTQ reads are located", required=True) parser.add_argument("--cores", help="Number of cores to run RSEM on", default='10') parser.add_argument( "--index", help="Fullpath to the RSEM index in: /full/file/path/basename form", default="/data/refs/hg19/hg19-RSEM", required=True)
# # symbolic link relative to original directory so that the entire path # can be moved around with breaking everything # os.symlink( os.path.relpath(os.path.abspath(input_file), os.path.abspath(os.path.dirname(soft_link_name))), soft_link_name) hostname = socket.gethostname() sys_cfg = ConfigParser() sys_cfg.read('{}.sys.cfg'.format(hostname)) parser = cmdline.get_argparse( description= 'Perform exome analysis on alignment files in bam format using GATK.') parser.add_argument('--gatk', default=sys_cfg.get('program', 'gatk'), help='path to GATK jar file') parser.add_argument("input_bams", nargs='*') parser.add_argument( '--ref', required=True, help= 'specify which reference to use. It should be consistent with the reference used in alignment.' ) parser.add_argument( '--intervals', help= "One or more genomic intervals over which to operate. GATK engine parameter."
from ruffus import * from ruffus.cmdline import MESSAGE from collections import defaultdict import ruffus.cmdline as cmdline import subprocess import glob import re import os import argparse import sys import time DEFAULTADAPT = "AGATCGGAAGAGCACACG" parser = cmdline.get_argparse( description="ChemModSeqPipeline, a pipeline for processing ChemModSeq data" ) parser.add_argument("-f", dest="forwardreads", help="the path to your fastq files with forward reads.", metavar="data_1.fastq", nargs="*", default=None) parser.add_argument("-r", dest="reversereads", help="the path to your fastq files with reverse reads.", metavar="data_2.fastq", nargs="*", default=None) parser.add_argument("-g", "--gtf",
return sip_pipe # test_org_list = ["/current_projects/genomic_purity/test_files/Acaryochloris_marina_MBIC11017_uid12997"] # pipeline1a = make_sipp(org_list = test_org_list) ############################################################################### ## ## Commandline iterface ## ############################################################################### import ruffus.cmdline as cmdline parser = cmdline.get_argparse( description='Pipelines for genomic contaminant study', version="genomic_contamination_study_pipe.py v. 0.0.0.9000") # parser.add_argument('--pipeline', "-p", # type=str, # choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'], # help="Defining which pipeline to run") parser.add_argument( '--config_file', "-cf", type=str, #metavar="config_file", help="yaml file with pipeline parameters") options = parser.parse_args()
########################################################## # Input: output of runDifferentialExpression function (.tsv files) # Output: a venn diagram showing overlap between DEGs # Type of operation: many-to-1 # Ruffus decorator used: merge @follows(volcanoPlot) # requires volcanoPlot to be run before @merge(runDifferentialExpression, 'data/differential_genes-venn_diagram.png') def testPipelineStep(infiles, outfile): # Print print('##### Step 4. Plotting venn diagram ({outfile})...'.format( **locals())) # Run run_r_job('plot_venn_diagram', infiles, outfile, run_locally=True) ################################################## ################################################## ########## Run pipeline ################################################## ################################################## # Get options specified from commandline options = cmdline.get_argparse().parse_args() # Run indicated steps if __name__ == '__main__': cmdline.run(options) print('Done!')
# ruffus imports from ruffus import * import ruffus.cmdline as cmdline import logging, time, os, subprocess from ebseq_extras import EbseqExtras # EMAIL import smtplib from email.MIMEMultipart import MIMEMultipart from email.MIMEBase import MIMEBase from email.MIMEText import MIMEText from email.Utils import COMMASPACE, formatdate from email import Encoders parser = cmdline.get_argparse(description="This is a pipeline for RSEM alignment and subsequent DE gene analysis with EBseq") # Program arguments parser.add_argument("--dir", help="Fullpath to the directory where the FASTQ reads are located", required=True) parser.add_argument("--cores", help="Number of cores to run RSEM on", default='10') parser.add_argument("--index", help="Fullpath to the RSEM index in: /full/file/path/basename form", default="/data/refs/hg19/hg19-RSEM", required=True) parser.add_argument("--output", help="Fullpath to output directory", default="./") parser.add_argument("--conf", help="Fullpath to conf tsv file, (<sample><frag-mean><frag-sd><cond>)", required=True) parser.add_argument("--fdr", help=" false discovery rate to use for DE", default=0.05) # reporting parser.add_argument("--emails", help="Emails to send results too", default="*****@*****.**", nargs="+")
import os
import glob
from ruffus import *
import ruffus.cmdline as cmdline

DATA_FP="/media/8TB_PLAYGROUND/home/ecl/ext/100_SCID/103_Virome/data_files/"

# glob.glob returns names in arbitrary order, so both lists are sorted before
# zipping; otherwise an R1 file could be paired with another sample's R2 file.
fwd_files = sorted(glob.glob(DATA_FP + "*R1.fastq.gz"))
rev_files = sorted(glob.glob(DATA_FP + "*R2.fastq.gz"))
# NOTE(review): zip stops at the shorter list, so a sample missing one mate
# would still shift the pairing; confirm every R1 has its R2.
starting_files = list(zip(fwd_files, rev_files))

parser = cmdline.get_argparse(description="Pairs reads using PEAR")
options = parser.parse_args()


@transform(starting_files, suffix("R1.fastq.gz"), "assembled.fastq")
def pair_reads(input_files, output_file):
    """Merge one forward/reverse read pair with PEAR.

    input_files -- (forward fastq, reverse fastq) tuple from starting_files
    output_file -- forward file name with "R1.fastq.gz" replaced by
                   "assembled.fastq"; passed to pear as its -o argument

    Raises RuntimeError when the pear command reports a non-zero status, so
    that a failed run is not recorded as a completed task.
    """
    # NOTE(review): PEAR documents -o as an output *base name* to which it
    # appends its own extensions; confirm the file it writes matches
    # output_file.
    command = "pear -j 4 -f {} -r {} -o {}".format(
        input_files[0], input_files[1], output_file)
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "pear failed with status {}: {}".format(status, command))


cmdline.run(options)
# The Third pipeline is a clone of pipeline1b pipeline1c = pipeline1b.clone(new_name="pipeline1c") # Set the "originate" files for pipeline1c to ("e.1" and "f.1") # Otherwise they would use the original ("c.1", "d.1") pipeline1c.set_output(output=[]) pipeline1c.set_output(output=[tempdir + "/" + ss for ss in ("e.1", "f.1")]) # Join all pipeline1a-c to pipeline2 pipeline2 = make_pipeline2() pipeline2.set_input(input=[pipeline1a, pipeline1b, pipeline1c]) import ruffus.cmdline as cmdline parser = cmdline.get_argparse( description='Demonstrates the new Ruffus syntax in version 2.6') parser.add_argument('--cleanup', "-C", action="store_true", help="Cleanup before and after.") options = parser.parse_args() # standard python logger which can be synchronised across concurrent Ruffus tasks logger, logger_mutex = cmdline.setup_logging(__file__, options.log_file, options.verbose) logger.debug("\tRuffus Version = " + ruffus.__version__) if options.cleanup:
logger.debug("Can't unlink %s" % (soft_link_name)) with logging_mutex: logger.debug("os.symlink(%s, %s)" % (input_file, soft_link_name)) # # symbolic link relative to original directory so that the entire path # can be moved around with breaking everything # os.symlink( os.path.relpath(os.path.abspath(input_file), os.path.abspath(os.path.dirname(soft_link_name))), soft_link_name) hostname = socket.gethostname() sys_cfg = ConfigParser() sys_cfg.read('{}.sys.cfg'.format(hostname)) parser = cmdline.get_argparse(description='Perform exome analysis on alignment files in bam format using GATK.') parser.add_argument('--gatk', default=sys_cfg.get('program', 'gatk'), help='path to GATK jar file') parser.add_argument("input_bams", nargs='*') parser.add_argument('--ref', required=True, help='specify which reference to use. It should be consistent with the reference used in alignment.') parser.add_argument('--intervals', help="One or more genomic intervals over which to operate. GATK engine parameter.") parser.add_argument('--output-dir', dest='working_dir', default='.', help="directory for output files.") options = parser.parse_args("--verbose 1 --ref b37 --intervals test_data/sample_target.intervals " "-T remove_realign_interval -T remove_realigned_bam -T remove_read_group_file " "-T remove_intermediate_vcfs " "--output-dir test_output".split()) # standard python logger which can be synchronised across concurrent Ruffus tasks logger, logging_mutex = cmdline.setup_logging(__name__, options.log_file, options.verbose)
import sys
import os, errno
import glob
import ruffus as rf
import ruffus.cmdline as cmdline
import pandas as pd
import numpy as np
import os, errno
import yaml
# import logging
from trio import triodb
# import warnings
# warnings.filterwarnings("ignore", message=".*deprecation.*")

# Configuration and command line options
# The only pipeline-specific option is --config, the path of a YAML file.
parser = cmdline.get_argparse(description='Pipeline using the TPC-H example.')
parser.add_argument("--config")
options = parser.parse_args()

# lg is the logger, lm its mutex, as returned by cmdline.setup_logging
lg, lm = cmdline.setup_logging(__name__, options.log_file, options.verbose)
# lg.setLevel(logging.INFO)

# Without a config file there is nothing to run: show the usage and stop.
if options.config is None:
    # call form of print works on both Python 2 and Python 3
    print("No config supplied.")
    parser.print_help()
    sys.exit()

with open(options.config, 'r') as f:
    # safe_load builds plain Python data only; yaml.load without an explicit
    # Loader can construct arbitrary objects and is rejected by recent PyYAML.
    cfg = yaml.safe_load(f)

lg.info('pipeline:: ::config %s', cfg)

SQL_PRINT_MAX = 1000
# =============================================================
config = yaml.safe_load(open("aws-config.yml")) #REFERENCE reference_list = (config['reference']) human_decoy = reference_list['human_decoy'] bed_list = (config['bed']) fh_bed = bed_list['FH'] software_list = (config['software']) gatk_picard = software_list['gatk_picard'] #picard_SortSam = software_list['picard_SortSam'] #mark_duplicates = software_list['mark_duplicates'] #gatk_jar = software_list['gatk'] parser = cmdline.get_argparse(description='Small pipeline for aws') parser.add_argument("--input") options = parser.parse_args() run_directory = options.input #check valid input dir provided if os.path.isdir(run_directory): os.chdir(run_directory) log_dir = os.path.join(run_directory, "logs") if not os.path.exists(log_dir): os.mkdir(log_dir, 0o755) extension = ("*.fastq.gz") input_files = []
# :) so i never have to touch excel import pandas as pd # for cummerbund import rpy2.robjects as robjects from rpy2.robjects.packages import importr # EMAIL import smtplib from email.MIMEMultipart import MIMEMultipart from email.MIMEBase import MIMEBase from email.MIMEText import MIMEText from email.Utils import COMMASPACE, formatdate from email import Encoders parser = cmdline.get_argparse(description='This pipeline provides a number of funtionalities for working with RNAseq data') # Program arguments parser.add_argument("--dir", help="Fullpath to the directory where the bams are located", required=True) parser.add_argument("--cores", help="Number of cores to run cuffdiff on", default='10') parser.add_argument("--output", help="Fullpath to output directory", default="./") parser.add_argument("--size", help="Fullpath to size file") parser.add_argument("--gtf", help="Fullpath to gtf file", required=True) parser.add_argument("--de-conf", help="fullpath to differential expresssion configuration file", required=True) parser.add_argument("--annotation-db", help="fullpath to the sqlite db file, <id><name><desc>") parser.add_argument("--annotation-file", help="fullpath to a tsv file of gene annotations, will create sqlite db") # reporting parser.add_argument("--emails", help="Emails to send DE results too", default="*****@*****.**", nargs="+")
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   options

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

import sys, os
import os.path

# add self to search path for testing
# (two directories above the one containing the running script)
exe_path = os.path.dirname(os.path.abspath(sys.argv[0]))
sys.path.append(os.path.abspath(os.path.join(exe_path, "..", "..")))

from ruffus import *
import ruffus.cmdline as cmdline

# argparse expands only the %(prog)s placeholder in a version string; the
# optparse-style "%prog" used previously was printed literally by --version.
parser = cmdline.get_argparse(
    description='Tests legacy @files_re with combine()',
    version="%(prog)s 1.0")
parser.add_argument("-D", "--debug", action="store_true",
                    help="Make sure output is correct and clean up.")
options = parser.parse_args()

# standard python logger which can be synchronised across concurrent Ruffus tasks
logger, logger_mutex = cmdline.setup_logging(__name__, options.log_file, options.verbose)

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   imports
Kyle McChesney Ruffus pipeline for simple bowtie alignment """ from ruffus import * from big_wig_extras import BigWigExtras import ruffus.cmdline as cmdline import subprocess import logging import os import pprint import re import time parser = cmdline.get_argparse(description='Given a directory of sorted bam files, convert them to adjusted bigWigs') # Program arguments -- Most go straight to bowtie parser.add_argument("--dir", help="Fullpath to the directory where the BAMS are located", required=True) parser.add_argument("--size", help="Fullpath to size file") #parser.add_argument("--reads", help="Fullpath to read stats file", required=True) parser.add_argument("--output", help="Fullpath to output dir", default="./") # parse the args options = parser.parse_args() # Kenny loggins log = logging.getLogger(__name__) log.setLevel(logging.INFO) log_formatter = logging.Formatter('%(asctime)s {%(levelname)s}: %(message)s')
There is a PowerPoint presentation with details of all the ruffus decorators and which goes through the functions in this pipeline one by one. No input is required to run the pipeline. ''' # ruffus imports from ruffus import * from ruffus.combinatorics import * import ruffus.cmdline as cmdline import sys import os # this block is essential to run ruffus from the command line parser = cmdline.get_argparse(description='filter_transcripts') options = parser.parse_args() @originate(['a_originate.tsv', 'b_originate.tsv']) def exampleOriginate(outfile): ''' Example of ruffus originate decorator. @originate generates new files from scratch (0 to many operation) Here, this function generates two new files: a_originate.tsv b_originate.tsv '''
import ruffus.cmdline as cmdline # custom functions #from tophat_extras import TophatExtras # system imports import subprocess, logging, os, re, time import pandas as pd import smtplib from email.MIMEMultipart import MIMEMultipart from email.MIMEBase import MIMEBase from email.MIMEText import MIMEText from email.Utils import COMMASPACE, formatdate from email import Encoders parser = cmdline.get_argparse(description='seq_pipe: A pipeline for performing various NGS analysis tasks') # this is the only 'required' argument # will control which pipeline is run parser.add_argument("--analysis", help=" What type of analysis to perform", choices = ["bowtie","tophat","cuffdiff","DE","bigWig"], required=True) # Program arguments parser.add_argument("--input-dir", help="Fullpath to the directory where the input files are located", required=True) parser.add_argument("--cores", help="Number of cores to run multi-threaded programs on", default='3') parser.add_argument("--index", help="Fullpath to the bowtie2 index in: /full/file/path/basename form", default="/data/refs/hg19/hg19") parser.add_argument("--output", help="Fullpath to output directory", default="./") parser.add_argument("--size", help="Fullpath to size file") parser.add_argument("--gtf", help="Fullpath to gtf file") parser.add_argument("--paired", help="Indicates whether the reads in --dir are paired_end. MUST FOLLOW _1 _2 convention", default=False)
- DE with DESEQ """ import logging import os import subprocess import time from functools import partial import pandas as pd import ruffus.cmdline as cmdline import sh from ruffus import * from utils import guess_simple_design_matrix, log_line, make_fastq_list, send_report parser = cmdline.get_argparse(description='RSEM and deseq2 pipeline') parser.add_argument("--dir", help="Fullpath to the directory where the FASTQ reads are located", required=True) parser.add_argument("--index", help="Fullpath to the bowtie2 index in: /full/file/path/basename form", default="/data/refs/hg19/hg19-rsem") parser.add_argument("--output", help="Fullpath to output directory", default="./") parser.add_argument("--name", help="Optional experiment name", dest="exp_name", default="rsem-deseq")
# Script prologue: imports and the first command-line options of the bowtie
# alignment pipeline.
import logging
import os
import pprint
import re
import time

# EMAIL
# NOTE(review): these capitalised email.* module paths are the Python 2
# spellings; confirm the interpreter this script targets.
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email.Utils import COMMASPACE, formatdate
from email import Encoders

parser = cmdline.get_argparse(
    description=
    'Given a directory of NON-paired end reads -- Align them with bowtie')

# Program arguments -- Most go straight to bowtie
# nargs='+' makes options.dir a list of one or more directories
parser.add_argument(
    "--dir",
    help="Fullpath to the directory where the FASTQ reads are located",
    required=True,
    nargs='+')
# NOTE(review): no type= is declared, so a value given on the command line is
# a str while the default is the int 10.
parser.add_argument("--cores",
                    help="Number of cores to run bowtie on",
                    default=10)
parser.add_argument(
    "--index",
    help="Fullpath to the bowtie2 index in: /full/file/path/basename form",
    default="/data/refs/hg19/hg19")
# The Third pipeline is a clone of pipeline1b pipeline1c = pipeline1b.clone(new_name = "pipeline1c") # Set the "originate" files for pipeline1c to ("e.1" and "f.1") # Otherwise they would use the original ("c.1", "d.1") pipeline1c.set_output(output = []) pipeline1c.set_output(output = [tempdir + "/" + ss for ss in ("e.1", "f.1")]) # Join all pipeline1a-c to pipeline2 pipeline2 = make_pipeline2() pipeline2.set_input(input = [pipeline1a, pipeline1b, pipeline1c]) import ruffus.cmdline as cmdline parser = cmdline.get_argparse(description='Demonstrates the new Ruffus syntax in version 2.6') parser.add_argument('--cleanup', "-C", action="store_true", help="Cleanup before and after.") options = parser.parse_args() # standard python logger which can be synchronised across concurrent Ruffus tasks logger, logger_mutex = cmdline.setup_logging (__file__, options.log_file, options.verbose) logger.debug("\tRuffus Version = " + ruffus.__version__)
# Module-level setup for the CRAC pipeline script:
#   * standard-library imports;
#   * GFF_FIELDS: the nine column names used for GFF/GTF records;
#   * the command-line parser, created via cmdline.get_argparse() and extended
#     with -f (forward read files, zero or more, stored as ``forwardreads``)
#     and -g/--gtf (annotation file, stored as ``gtf``).
# NOTE(review): ``cmdline`` is not imported in the visible part of the script,
# and the final ``-c/--chromosome`` add_argument call is cut off here.
import subprocess import platform import glob import re import os import argparse import sys import time GFF_FIELDS = [ "seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attributes" ] parser = cmdline.get_argparse( description="CRAC pipeline for processing single-end multiplexed CRAC data" ) parser.add_argument("-f", dest="forwardreads", help="the path to your fastq read files.", metavar="data_1.fastq data_2.fastq ...", nargs="*", default=None) parser.add_argument("-g", "--gtf", dest="gtf", help="the path to your gtf annotation file", metavar="rRNA.gtf", default=None) parser.add_argument("-c", "--chromosome",
# slurm options
#SBATCH --ntasks=1
#SBATCH --job-name="pipeline"
#SBATCH --mail-type=ALL
#SBATCH --output=ruffus/pipeline.%j.log

# imports
import os
import re
import datetime
from ruffus import *
import ruffus.cmdline as cmdline
from subprocess import Popen, PIPE

# command-line options: the Ruffus parser plus the JGI credentials
parser = cmdline.get_argparse(description='Run LMD analysis pipeline.')
parser.add_argument(
    '--email', '-e',
    help='Logon email address for JGI',
    type=str,
    dest='jgiLogon',
)
# NOTE(review): a password given on the command line can show up in shell
# history and process listings -- confirm this is acceptable for this cluster.
parser.add_argument(
    '--password', '-p',
    help='JGI password',
    type=str,
    dest='jgiPassword',
)
options = parser.parse_args()

# Keep the credentials in module-level names.
jgiLogon = options.jgiLogon
jgiPassword = options.jgiPassword

# parse SLURM job-id (slurm_jobid is only defined when the variable is set)
if 'SLURM_JOBID' in os.environ:
    slurm_jobid = os.environ['SLURM_JOBID']
suffix("-appendAlign.sam"),"-sam-report.tsv") sip_pipe.set_head_tasks([sip_pipe[task_originate]]) return sip_pipe # test_org_list = ["/current_projects/genomic_purity/test_files/Acaryochloris_marina_MBIC11017_uid12997"] # pipeline1a = make_sipp(org_list = test_org_list) ############################################################################### ## ## Commandline interface ## ############################################################################### import ruffus.cmdline as cmdline parser = cmdline.get_argparse(description='Pipelines for genomic contaminant study', version = "genomic_contamination_study_pipe.py v. 0.0.0.9000") # parser.add_argument('--pipeline', "-p", # type=str, # choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'], # help="Defining which pipeline to run") parser.add_argument('--config_file', "-cf", type=str, #metavar="config_file", help="yaml file with pipeline parameters") options = parser.parse_args()
import os
import glob
import subprocess

from ruffus import *
import ruffus.cmdline as cmdline

DATA_FP = "/media/8TB_PLAYGROUND/home/ecl/ext/100_SCID/103_Virome/data_files/"

# glob.glob() returns matches in arbitrary order, so both lists are sorted
# before zipping; otherwise an R1 file could be paired with the R2 file of a
# different sample. This assumes the mates differ only in the R1/R2 part of
# the file name and that every sample has both files.
fwd_files = sorted(glob.glob(DATA_FP + "*R1.fastq.gz"))
rev_files = sorted(glob.glob(DATA_FP + "*R2.fastq.gz"))
starting_files = list(zip(fwd_files, rev_files))

parser = cmdline.get_argparse(description="Pairs reads using PEAR")
options = parser.parse_args()


@transform(starting_files, suffix("R1.fastq.gz"), "assembled.fastq")
def pair_reads(input_files, output_file):
    """Merge one forward/reverse FASTQ pair by running ``pear``.

    Parameters
    ----------
    input_files : tuple
        ``(forward_path, reverse_path)`` as built in ``starting_files``.
    output_file : str
        Output name derived by Ruffus; passed to ``pear`` as ``-o``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``pear`` exits with a non-zero status, so that a failed merge is
        reported instead of being silently ignored.
    """
    forward_path, reverse_path = input_files
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    # NOTE(review): confirm whether pear treats -o as a base name and appends
    # its own suffixes; if so, the file Ruffus expects may never be created.
    subprocess.check_call([
        "pear", "-j", "4",
        "-f", forward_path,
        "-r", reverse_path,
        "-o", output_file,
    ])


cmdline.run(options)
# Build the ocrmypdf command-line parser from Ruffus' cmdline.get_argparse().
#   * ignored_args lists Ruffus option names handed to get_argparse so they are
#     left out of this parser (presumably to hide pipeline internals from end
#     users -- confirm against the Ruffus cmdline documentation).
#   * fromfile_prefix_chars='@' is the argparse setting that lets arguments be
#     read from a file given as "@file" (assumes get_argparse forwards it).
#   * formatter_class=argparse.RawDescriptionHelpFormatter asks argparse to
#     print the description and epilog below without re-wrapping them.
parser = cmdline.get_argparse( prog="ocrmypdf", version=VERSION, fromfile_prefix_chars='@', ignored_args=[ 'touch_files_only', 'recreate_database', 'checksum_file_name', 'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format', 'forced_tasks', 'target_tasks', 'use_threads', 'jobs', 'log_file' ], formatter_class=argparse.RawDescriptionHelpFormatter, description="""\ Generates a searchable PDF or PDF/A from a regular PDF. OCRmyPDF rasterizes each page of the input PDF, optionally corrects page rotation and performs image processing, runs the Tesseract OCR engine on the image, and then creates a PDF from the OCR information. """, epilog="""\ OCRmyPDF attempts to keep the output file at about the same size. If a file contains losslessly compressed images, and output file will be losslessly compressed as well. PDF is a page description file that attempts to preserve a layout exactly. A PDF can contain vector objects (such as text or lines) and raster objects (images). A page might have multiple images. OCRmyPDF is prepared to deal with the wide variety of PDFs that exist in the wild. When a PDF page contains text, OCRmyPDF assumes that the page has already been OCRed or is a "born digital" page that should not be OCRed. The default behavior is to exit in this case without producing a file. You can use the option --skip-text to ignore pages with text, or --force-ocr to rasterize all objects on the page and produce an image-only PDF as output. ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf ocrmypdf --force-ocr word_document.pdf output.pdf If you are concerned about long-term archiving of PDFs, use the default option --output-type pdfa which converts the PDF to a standardized PDF/A-2b. This converts images to sRGB colorspace, removes some features from the PDF such as Javascript or forms. If you want to minimize the number of changes made to your PDF, use --output-type pdf. 
If OCRmyPDF is given an image file as input, it will attempt to convert the image to a PDF before processing. For more control over the conversion of images to PDF, use the Python package img2pdf or other image to PDF software. For example, this command uses img2pdf to convert all .png files beginning with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and sending the result to OCRmyPDF through a pipe. img2pdf is a dependency of ocrmypdf so it is already installed. img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf """)
import subprocess
import logging
import os
import pprint
import re
import time

# EMAIL
# The capitalised module names (email.MIMEMultipart, email.Utils, ...) were
# removed in Python 3. The lowercase names below exist on Python 2.5+ and 3,
# and bind exactly the same names in this module.
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.utils import COMMASPACE, formatdate
from email import encoders as Encoders


def _str2bool(value):
    """Convert a command-line string into a boolean.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy. This converter treats the usual
    negative spellings (case-insensitive, surrounding whitespace ignored) as
    ``False`` and every other value as ``True``, so inputs that previously
    meant "on" still do.
    """
    negatives = ("", "0", "false", "f", "no", "n", "off")
    return str(value).strip().lower() not in negatives


# Command-line interface: start from the parser returned by Ruffus'
# cmdline.get_argparse() and add the aligner-specific options.
parser = cmdline.get_argparse(
    description='Given a directory of NON-paired end reads -- Align them with bowtie')

# Program arguments -- Most go straight to bowtie
parser.add_argument(
    "--dir",
    help="Fullpath to the directory where the FASTQ reads are located",
    required=True,
    nargs='+')
# type=int keeps a user-supplied value the same type as the integer default;
# without it "--cores 4" would arrive as the string "4".
parser.add_argument(
    "--cores",
    help="Number of cores to run bowtie on",
    type=int,
    default=10)
parser.add_argument(
    "--index",
    help="Fullpath to the bowtie2 index in: /full/file/path/basename form",
    default="/data/refs/hg19/hg19")
parser.add_argument(
    "--output",
    help="Fullpath to output directory",
    default="./")
parser.add_argument(
    "--size",
    help="Fullpath to size file")

# optional arguments to control turning on and off tasks
# (parsed with _str2bool so that "--wig False" really disables the task)
parser.add_argument(
    "--wig",
    help="Whether or not wig files should be generated",
    type=_str2bool,
    default=False)
parser.add_argument(
    "--bowtie",
    help="Whether to use bowtie one instead of two",
    type=_str2bool,
    default=False)

# parse the args
options = parser.parse_args()