def test_newstyle_task(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task3,
                            input=task1,
                            filter=regex(r"(.+)"),
                            replace_inputs=ruffus.inputs(
                                ((r"\1"), task2, "test_transform_inputs.*y")),
                            output=r"\1.output")
    test_pipeline.merge(task4, (task3), tempdir + "final.output")
    test_pipeline.run([task4], multiprocess=10, verbose=0)

    correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
        tempdir=tempdir)
    with open(tempdir + "final.output") as ff:
        real_output = ff.read()
    self.assertEqual(correct_output, real_output)
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using set_input() for transform etc.
    # or set_output() for originate.
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate(),
        # so long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name:
        # the function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask the Pipeline to look up the Task from its name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    # Set the tail task so that users of my sub-pipeline can use it as a dependency
    # without knowing the details of task names
    #
    # Use the Task() object directly without having to look it up
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    # specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub-pipeline can send input into it
    # without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
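# make_pipeline1() above sets head and tail tasks so that callers can wire the
# sub-pipeline into a larger one without knowing its internal task names. A
# minimal sketch of such a caller follows; it assumes the task_1_to_1 function
# and tempdir from the surrounding test module, and the "44_to_55" task name
# is purely illustrative.
def make_main_pipeline():
    sub_pipeline = make_pipeline1("pipeline1", [tempdir + "/a.start"])
    main_pipeline = Pipeline("main")
    main_pipeline.transform(task_func=task_1_to_1,
                            name="44_to_55",
                            # the sub-pipeline stands in for its tail task(s)
                            input=sub_pipeline,
                            filter=suffix(".44"),
                            output=".55")
    return main_pipeline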
def test_newstyle_mkdir(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(task_which_makes_directories,
                          mkdir(directories),
                          mkdir(tempdir + 'c'),
                          mkdir(tempdir + 'd', tempdir + 'e'),
                          mkdir(tempdir + 'e'))
    test_pipeline.run(multiprocess=10, verbose=0)

    for d in 'abcde':
        fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
        self.assertTrue(os.path.exists(fullpath))
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
        .follows(mkdir(tempdir))
    test_pipeline.split(task_func=step_4_split_numbers_into_chunks,
                        input=tempdir + "random_numbers.list",
                        output=tempdir + "*.chunks")\
        .follows(create_random_numbers)
    test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                            input=step_4_split_numbers_into_chunks,
                            filter=suffix(".chunks"),
                            output=".sums")
    test_pipeline.merge(task_func=step_6_calculate_variance,
                        input=step_5_calculate_sum_of_squares,
                        output=os.path.join(tempdir, "variance.result"))\
        .posttask(lambda: sys.stdout.write(" hooray\n"))\
        .posttask(print_hooray_again, print_whoppee_again,
                  touch_file(os.path.join(tempdir, "done")))
    test_pipeline.run(multiprocess=50, verbose=0)

    output_file = os.path.join(tempdir, "variance.result")
    if not os.path.exists(output_file):
        raise Exception("Missing %s" % output_file)
def test_newstyle_mkdir(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(task_which_makes_directories,
                          mkdir(directories),
                          mkdir(unicode(tempdir + "c")),
                          mkdir(unicode(tempdir + "d"), unicode(tempdir + "e")),
                          mkdir(unicode(tempdir + "e")))\
        .posttask(touch_file(unicode(tempdir + "f")))
    test_pipeline.originate(task_which_makes_files, [tempdir + "g", tempdir + "h"])
    test_pipeline.run(multiprocess=10, verbose=0)

    for d in 'abcdefgh':
        fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
        self.assertTrue(os.path.exists(fullpath))
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.follows(setup_simulation_data,
                          mkdir(gene_data_dir, simulation_data_dir))
    test_pipeline.files(gwas_simulation, generate_simulation_params)\
        .follows(setup_simulation_data)\
        .follows(mkdir(working_dir,
                       os.path.join(working_dir, "simulation_results")))
    test_pipeline.collate(statistical_summary, gwas_simulation,
                          regex(r"simulation_results/(\d+).\d+.simulation_res"),
                          r"\1.mean")\
        .posttask(lambda: sys.stdout.write("\nOK\n"))
    test_pipeline.run(multiprocess=50, verbose=0)

    for oo in "000.mean", "001.mean":
        results_file_name = os.path.join(working_dir, oo)
        if not os.path.exists(results_file_name):
            raise Exception("Missing %s" % results_file_name)
def test_newstyle_task(self):
    """
    Same as above but construct a new pipeline on the fly without decorators
    """
    test_pipeline = Pipeline("test")
    test_pipeline.files(task1, None, tempdir + 'a.1')\
        .follows(mkdir(tempdir))
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=regex(r".*"),
                            output=tempdir + 'b.1')
    test_pipeline.files(task3, task2, tempdir + 'c.1')
    test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
        .follows(task3)
    test_pipeline.files(task5, task4, tempdir + "f.1")
    test_pipeline.run(multiprocess=10, verbose=0)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=prepare_files,
                        input=None,
                        output=tempdir + '*.animal')\
        .follows(mkdir(tempdir, tempdir + "test"))\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))
    test_pipeline.collate(task_func=summarise_by_grouping,
                          input=prepare_files,
                          filter=regex(r'(.*/).*\.(.*)\.animal'),
                          output=r'\1\2.results')\
        .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))
    test_pipeline.run(multiprocess=10, verbose=0)
    check_species_correct()
def quality_boxplot(in_stats, out_boxplot):
    """draw a boxplot for the quality scores"""
    cmd = 'fastq_quality_boxplot_graph.sh -t %s -i %s -o %s' % (
        in_stats, in_stats, out_boxplot)
    check_call(cmd)


@transform(quality_stats, suffix('.qual_stats'), '.qual_nuc_dist.png')
def quality_nuc_dist(in_stats, out_dist):
    'show the nucleotide distribution across the reads'
    cmd = 'fastx_nucleotide_distribution_graph.sh -t %s -i %s -o %s' % (
        in_stats, in_stats, out_dist)
    check_call(cmd)


@follows(mkdir('summaries'))
@merge([original_reads, clip_adapter, trim_reads, trim_regex,
        filter_artifacts, filter_min_quality],
       # join('..', 'summaries', 'fastq.wikisummary'))
       'fastq.wikisummary')
def summarize_fastq_reads(in_fastq, out_summary):
    """Summarize fastq line counts"""
    with open(out_summary, 'w') as outfile:
        outfile.write("""
{| class="wikitable"
|+ Summary of raw read counts
!scope="col" | Dataset
!scope="col" | Number of raw reads
|-
""")
        for infile in in_fastq:
    dbh = sqlite3.connect(PARAMS["database_name"])
    return dbh


def connectToUCSC():
    return PipelineGtfsubset.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=PARAMS["ucsc_database"])


############################################################
# Assembly
############################################################

@follows(mkdir('assembly.dir'))
@files(os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fasta"),
       PARAMS['interface_contigs'])
def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from indexed genome :term:`fasta` files and
    outputs to a text file.

    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, matched
for key in P.PARAMS:
    if is_none(P.PARAMS[key]):
        P.PARAMS[key] = None
    elif is_on(P.PARAMS[key]):
        P.PARAMS[key] = True

# Global variables
CREATE_BIGWIGS = P.PARAMS.get("run_options_bigwigs")
CREATE_HUB = P.PARAMS.get("run_options_hub")


#############
# Pipeline #
#############

@follows(mkdir("statistics"), mkdir("statistics/fastqc"))
@transform("*.fastq.gz", regex(r"(.*).fastq.gz"), r"statistics/fastqc/\1_fastqc.zip")
def qc_reads(infile, outfile):
    """Quality control of raw sequencing reads"""
    statement = "fastqc -q -t %(pipeline_n_cores)s --nogroup %(infile)s --outdir statistics/fastqc"
    P.run(
        statement,
        job_queue=P.PARAMS["pipeline_cluster_queue"],
        job_threads=P.PARAMS["pipeline_n_cores"],
        job_condaenv=P.PARAMS["conda_env"],
    )


@merge(qc_reads, "statistics/readqc_report.html")
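# The normalisation loop above relies on is_none()/is_on(), which are not
# shown in this fragment. A plausible minimal sketch of those helpers,
# assuming config values arrive as strings from a YAML/ini file:
def is_none(value):
    """Treat missing values and the literal strings ''/'none' as None."""
    return value is None or str(value).strip().lower() in ("", "none")


def is_on(value):
    """Interpret common truthy spellings used in config files."""
    return str(value).strip().lower() in ("1", "true", "yes", "on")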
    if not os.path.exists(PARAMS["annotations_database"]):
        raise ValueError(
            "can't find database '%s'" % PARAMS["annotations_database"])

    statement = '''ATTACH DATABASE '%s' as annotations''' % \
        (PARAMS["annotations_database"])

    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    return dbh


@follows(mkdir("geneset.dir"))
@merge(PARAMS["annotations_interface_geneset_all_gtf"],
       "geneset.dir/reference.gtf.gz")
def buildReferenceGeneSet(infile, outfile):
    '''filter full gene set and add attributes to create the reference gene set

    Performs merge and filter operations:
       * Merge exons separated by small introns (< 5bp).
       * Remove transcripts with very long introns (`max_intron_size`)
       * Remove transcripts located on contigs to be ignored (`remove_contigs`)
         (usually: chrM, _random, ...)
       * (Optional) Remove transcripts overlapping repetitive sequences
         (`rna_file`)

    This preserves all features in a gtf file (exon, CDS, ...)
from bz2 import BZ2File
from glob import iglob, glob

from ruffus import follows, files, inputs, merge, mkdir, regex, transform

from ..utils import CMD_DICT, call, pmsg, read_group_re


def copy_sequence_generator():
    for in_file in iglob('staging_area/*'):
        out_file = os.path.split(in_file)[-1]
        out_file = out_file.split(os.path.extsep)[0] + '.fastq.gz'
        out_file = os.path.join('fastq', out_file)
        yield [in_file, out_file]


# Copy sequence from staging area
@follows(mkdir('fastq', 'logs'))
@files(copy_sequence_generator)
def copy_sequence(input_file, output_file):
    '''Copy sequence files from staging area'''
    GZIP_HEADER = b'\x1f\x8b'
    BZIP_HEADER = b'BZ'
    pmsg('Copying sequence files', input_file, output_file)
    # check the magic bytes to see if this is actually a gzipped file
    # (read in binary mode so the comparison also works on Python 3)
    header = open(input_file, 'rb').read(2)
    if header == GZIP_HEADER:
        input_file_handle = gzip.open(input_file, 'rb')
    elif header == BZIP_HEADER:
        input_file_handle = BZ2File(input_file, 'r')
    else:
        input_file_handle = open(input_file, 'rb')
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#
#    task1
#
def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)


test_file = tempdir + "task.done"


@follows(mkdir(tempdir, tempdir + "test"))
@posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))
@split(None, tempdir + '*.animal')
def prepare_files(no_inputs, outputs):
    # cleanup previous
    for f in outputs:
        os.unlink(f)

    for grouping in species_list:
        for species_name in species_list[grouping]:
            filename = tempdir + "%s.%s.animal" % (species_name, grouping)
            with open(filename, "w") as oo:
                oo.write(species_name + "\n")

#
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')


@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta', r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.

    Creates a fasta-file of resulting genes and a gene to genome alignment.
    """
    out_fasta, out_db, out_msa = out_files
    COLOR_OCTOMAP_DIR, OCTOMAP_FILE,\
    COLOR_OCTOMAP_FILE, COLOR_OCTOMAP_BT, MERGED_CLOUD_FILE,\
    CAST_OCTOMAP_SINGLE, MERGED_VTK_FILE, STATIC_CLOUD_FILE,\
    STATIC_VTK_FILE, DYNAMIC_CLOUD_FILE, DYNAMIC_VTK_FILE,\
    FILTERED_CLOUDS_DIR, PARAMS_TO_LOAD,\
    MERGED_COLOR_CLOUDS_DIR, MERGED_COLOR_CLOUD_FILE,\
    MERGED_COLOR_VTK_FILE, LDR_UPSAMPLED_DIR, LDR_DIR,\
    NO_TRANSFORM, CAMERA
from pipeline_utils import file_num

dirs = [LDR_DIR, LDR_UPSAMPLED_DIR, POINTS_H5_DIR, PCD_DIR,
        PCD_DOWNSAMPLED_DIR, PCD_DOWNSAMPLED_NORMALS_DIR,
        ICP_TRANSFORMS_DIR, COLOR_DIR, COLOR_CLOUDS_DIR,
        MERGED_CLOUDS_DIR, MERGED_COLOR_CLOUDS_DIR, OCTOMAP_DIR,
        COLOR_OCTOMAP_DIR, FILTERED_CLOUDS_DIR]
MKDIRS = [mkdir(d) for d in dirs]

# NOTE chdir into dset dir so can just specify relative paths to data
os.chdir(DSET_DIR)

DOWNLOADS = list()
for f in REMOTE_FILES:
    DOWNLOADS.append([None, f])


@follows(*MKDIRS)
@files(DOWNLOADS)
def download_files(dummy, local_file):
    cmd = 'rsync -vr --ignore-existing %s/%s .' % (REMOTE_DATA_DIR, local_file)
    print(cmd)
    check_call(cmd, shell=True)


@follows('download_files')
def create_intervals_generator():
    for infile in glob('deduped/*.bam'):
        outfile = '%(line)s_s_%(lane)s.intervals' % \
            filename_re.search(infile).groupdict()
        yield [infile, 'intervals/%s' % outfile]


@jobs_limit(1)
@files(['sam/', 'sorted/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)


# Find candidate intervals for realignment
@follows(clean_up, mkdir('intervals', 'logs'))
@files(create_intervals_generator)
def create_intervals(input_file, output_file):
    '''Determine indel candidate intervals'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Interval Creation', input_file, output_file)
    gatk_cmd = '%(gatk)s --analysis_type RealignerTargetCreator ' + \
        '--reference_sequence %(reference)s ' + \
        '--DBSNP %(dbsnp)s ' + \
        '--input_file %(infile)s ' + \
        '--out %(outfile)s'
    call(gatk_cmd, cmd_dict)


# Realign around possible indels
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#
#   1   ->  2   ->  3   ->
#       ->  4           ->
#   5   ->  6
#

def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)


test_file = tempdir + "task.done"


#
#    task1
#
@originate([tempdir + d for d in ('a.1', 'b.1', 'c.1')])
@follows(mkdir(tempdir))
@posttask(lambda: do_write(test_file, "Task 1 Done\n"))
def task1(outfile, *extra_params):
    """
    First task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))
    test_job_io(None, outfile, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))


#
#    task2
#
def touch(filename):
    with open(filename, "w"):
        pass


if sys.hexversion >= 0x03000000:
    unicode = str


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#   Tasks
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(unicode(tempdir + "a")), unicode(tempdir + "b")]


@follows(mkdir(directories),
         mkdir(unicode(tempdir + "c")),
         mkdir(unicode(tempdir + "d"), unicode(tempdir + "e")),
         mkdir(unicode(tempdir + "e")))
@posttask(touch_file(unicode(tempdir + "f")))
def task_which_makes_directories():
    pass


@originate([tempdir + "g", tempdir + "h"])
def task_which_makes_files(o):
    touch(o)


import unittest


class Test_task_mkdir(unittest.TestCase):

    def setUp(self):
        """
        """
        sample = read_group_re.match(infile).groupdict()['sample']
        infiles_by_sample[sample] = infiles_by_sample.get(sample, []) + [infile]
    for sample, infiles in infiles_by_sample.items():
        yield [infiles, 'indels/%s.indels_raw.vcf' % sample]


@jobs_limit(1)
@files(['fixmate/', 'intervals/', 'deduped/', 'realigned/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files from recalibration stage'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)


# Call SNPs
@jobs_limit(1)
@follows(clean_up, mkdir('snps', 'logs'))
@merge('recalibrated/*.bam', 'snps/merged.snps_raw.vcf')
def snp_genotyping(input_files, output_file):
    '''Call SNP variants'''
    pmsg('SNP Genotyping', ', '.join(input_files), output_file)
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infiles'] = ' '.join(
        ['--input_file %s' % f for f in input_files])
    cmd_dict['outfile'] = output_file
    gatk_cmd = '%(gatk)s ' + \
        '--analysis_type UnifiedGenotyper ' + \
        '--reference_sequence %(reference)s ' + \
        '--DBSNP %(dbsnp)s ' + \
        '--intervals %(target_interval)s ' + \
        '--standard_min_confidence_threshold_for_calling 50 ' + \
        '--standard_min_confidence_threshold_for_emitting 30 ' + \
        '--annotation AlleleBalance ' + \
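# The call() helper imported from ..utils is not shown in these fragments.
# Judging by the '%(gatk)s ...' templates it receives, it presumably fills the
# command template from cmd_dict before executing it; a rough sketch of that
# contract (the name call_sketch is hypothetical, not the real helper):
import subprocess


def call_sketch(cmd_template, cmd_dict, is_logged=True):
    """Fill a '%(key)s' command template from cmd_dict and run it."""
    cmd = cmd_template % cmd_dict
    subprocess.check_call(cmd, shell=True)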
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


def touch(outfile):
    with open(outfile, "w"):
        pass


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#   Tasks
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "a.1"], [None, tempdir + "b.1"]])
def task1(i, o):
    touch(o)


@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "c.1"], [None, tempdir + "d.1"]])
def task2(i, o):
    touch(o)


@transform(task1, regex(r"(.+)"),
           ruffus.inputs(((r"\1"), task2, "test_transform_inputs.*y")),
           r"\1.output")
def task3(i, o):
output_text = "".join(sorted(output_text)) output_text += json.dumps(infiles) + " -> " + json.dumps(outfiles) + "\n" for f in outfile_names: with open(f, "w") as oo: oo.write(output_text) #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 # Tasks #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 @follows(mkdir(tempdir)) # # task1 # @files(None, tempdir + 'a.1') def task1(infiles, outfiles, *extra_params): """ First task """ test_job_io(infiles, outfiles, extra_params) # # task2 #
grandparent_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, grandparent_dir)

# module name = script name without extension
module_name = os.path.splitext(os.path.basename(__file__))[0]


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#   Tasks
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(tempdir + 'a'), tempdir + 'b']


@follows(mkdir(tempdir),
         mkdir(directories),
         mkdir(tempdir + 'c'),
         mkdir(tempdir + 'd', tempdir + 'e'),
         mkdir(tempdir + 'e'))
def task_which_makes_directories():
    pass


class Test_task_mkdir(unittest.TestCase):

    def setUp(self):
        """
        """
        pass

    def tearDown(self):
        """
        delete directories
        """
    """dummy task - no processing of reads."""
    pass


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)


@follows(mkdir("fasta.dir"))
@transform(unprocessReads,
           regex(SEQUENCEFILES_REGEX),
           r"fasta.dir/\1.fasta")
def makeAdaptorFasta(infile, outfile):
    """Make a single fasta file for each sample of all contaminant adaptor
    sequences for removal
    """
    PipelinePreprocess.makeAdaptorFasta(
        infile=infile,
        outfile=outfile,
        track=re.match(REGEX_TRACK, infile).groups()[0],
        dbh=connect(),
        contaminants_file=PARAMS["contaminants"],
    )


@merge(makeAdaptorFasta, "contaminants.fasta")
def aggregateAdaptors(infiles, outfile):
def circos_files():
    link_path = os.path.join(DATA_DIR, 'LinkageResults')
    align_path = os.path.join(DATA_DIR, 'LANLSequences', 'Alignments')
    ifiles = []
    for f in os.listdir(link_path):
        if f.endswith('.res'):
            ifiles.append(os.path.join(link_path, f))
    for f in os.listdir(align_path):
        if f.endswith('.aln'):
            ifiles.append(os.path.join(align_path, f))
    yield ifiles, None


@ruffus.follows(ruffus.mkdir(os.path.join(DATA_DIR, 'CircosFigs')))
@ruffus.files(circos_files)
def circos_figs(ifile, ofile):
    dump_path = os.path.join(DATA_DIR, 'CircosFigs')
    load_path = os.path.join(DATA_DIR, 'LinkageResults')
    align_path = os.path.join(DATA_DIR, 'LANLSequences', 'Alignments')

    prots = set()
    for f in os.listdir(align_path):
        if f.endswith('.aln'):
            prots.add(f)
    prots.add('All')

    lcuts = [0.5, 0.6, 0.7, 0.8, 0.9]
    graph = CircosGraph.load_from_dir(load_path, align_path)
    sort_fun = lambda x: x['Score']
    for lcut, prot in product(lcuts, prots):
import sh
import ruffus
import os
import random
import pyprind
import gzip
import simplejson as json

data_dir = os.environ['DATA']
words_dir = os.path.join(data_dir, "words")


# /usr/share/dict/words is a text file full of words on most unix systems
@ruffus.follows(ruffus.mkdir(words_dir))
@ruffus.originate(os.path.join(words_dir, "words.txt"))
def get_words(output_file):
    sh.cp("/usr/share/dict/words", output_file)
    sh.chmod("u+w", output_file)


@ruffus.transform(get_words, ruffus.suffix(".txt"), ".alphabet.json")
def build_alphabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f:
        for line in f:
            characters = characters.union(line.rstrip())
    alphabet = list(sorted(characters)) + ['PADDING', 'START', 'END']
    with open(output_file, 'w') as f:
def estimate_process():
    """Estimate number of processes based on the maximum size of fastq file."""
    size = max(SIZES) / (1000 * 1000 * 1000) * 4
    n = int(options.memory / size)
    if n == 0:
        n = 1
    elif n > options.cores:
        n = options.cores
    return n


PROCESS = estimate_process()


@ruffus.follows(ruffus.mkdir('fastq_to_bam'))
@ruffus.originate(list(FASTQS.keys()))
def soft_link(link):
    """Create soft links for original fastq files."""

    def make_link(path, link):
        if path:
            if path == os.path.abspath(link):
                message = (f"No symbolic link was made for {path}! "
                           f"You are directly working on the original file!")
                logger.warning(message)
            else:
                if not os.path.exists(link):
                    message = f'Soft link fastq: {os.path.basename(path)} ...'
                    cmding(f'ln -s {path} {link}', message=message)

    link1, link2 = link, link.replace('.r1.fastq.gz', '.r2.fastq.gz')
    make_link(FASTQS[link]['fastq1'], link1)
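# Worked example of the sizing arithmetic in estimate_process() above
# (illustrative numbers only, not from any real run): with a largest fastq of
# 2,000,000,000 bytes, size = 2 * 4 = 8; with options.memory = 32 (GB) this
# yields n = int(32 / 8) = 4 worker processes, clamped to [1, options.cores].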
"*.fastq.2.gz", "*.fastq.gz") FASTQ_DIR = PARAMS['fastq_dir'] # set to value for testing purposes (see regexes below) if FASTQ_DIR == "?!": FASTQ_DIR = "" FASTQ_FILES = tuple([os.path.join(FASTQ_DIR, suffix_name) for suffix_name in FASTQ_SUFFIXES]) FASTQ_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.1.gz")) FASTQ_PAIR = os.path.join(FASTQ_DIR, r"\1.fastq.2.gz") SE_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.gz")) GENESETS = [y for y in glob.glob(os.path.join("reference.dir/*.gtf.gz"))] @follows(mkdir("transcripts.dir")) @transform("%s" % PARAMS['annotations_geneset_gtf'], regex("reference.dir/(.+).gtf.gz"), r"transcripts.dir/\1.fa") def makeRepTranscripts(infile, outfile): ''' make a single representative transcript for each gene - put into a multi-fasta file ''' genome_file = "/".join([PARAMS['genome_dir'], PARAMS['genome']]) statement = ''' zcat %(infile)s | cgat gff2fasta --genome-file=%(genome_file)s
    statement = '''
    rm -rf prereq_* ctmp*;
    rm -rf test_* _cache _static _templates _tmp report;
    rm -f *.log csvdb *.load *.tsv'''
    P.run(statement)


###################################################################
###################################################################
###################################################################
# primary targets
###################################################################


@follows(mkdir("report"))
def build_report():
    '''build report from scratch.'''
    E.info("starting report build process from scratch")
    run_report(clean=True)


@follows(mkdir("report"))
def update_report():
    '''update report.'''
    E.info("updating report")
    run_report(clean=False)
@active_if("iclip_transcript_region_metagene" in PARAMS["methods"]) @merge(do_iclip_metagene, "transcript_regions.load") def merge_and_load_region_metagenes(infiles, outfile): P.concatenate_and_load( infiles, outfile, regex_filename=".+/(.+)-(.+)-(.+)\.vs\.(.+).tsv.gz", cat="source,condition,replicate,geneset", options=" -i source -i condition -i replicate -i geneset") # ------------------------------------------------------------------------- @follows(mkdir(os.path.join(PARAMS["export"], "images"))) @split(merge_and_load_metagenes, os.path.join(PARAMS["export"], "images/*.%s" % PARAMS["plotting"]["format"])) def do_plots(infile, outfiles): code_location = os.path.dirname(__file__) script_file = os.path.join(code_location, "plot_metagenes.R") statement = "Rscript %(script_file)s" P.run(statement) # --------------------------------------------------- # Generic pipeline tasks @follows(do_plots, merge_and_load_region_metagenes)
    cc.close()

    return dbh


# Determine whether the genome is paired
SPLICED_MAPPING = PARAMS["bam_paired_end"]


#########################################################################
# Count reads as some QC targets require it
#########################################################################
@follows(mkdir("nreads.dir"))
@transform("*.bam",
           suffix(".bam"),
           r"nreads.dir/\1.nreads")
def countReads(infile, outfile):
    '''Count number of reads in input files.'''
    statement = '''printf "nreads \\t" >> %(outfile)s'''
    P.run(statement)
    statement = '''samtools view %(infile)s | wc -l | xargs printf >> %(outfile)s'''
    P.run(statement)


#########################################################################
"""dummy task - no processing of reads.""" # if preprocess tools are specified, preprocessing is done on output that has # already been generated in the first run if PARAMS.get("preprocessors", None): if PARAMS["auto_remove"]: # check if fastqc has been run for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]): f = re.match(REGEX_TRACK, x).group(1) + ".fastqc" if not os.path.exists(f): raise ValueError("file %s missing, " "you need to run the pipeline once before " "specifying 'auto_remove'" % f) @follows(mkdir("fasta.dir")) @transform(unprocessReads, regex(SEQUENCEFILES_REGEX), r"fasta.dir/\1.fasta") def makeAdaptorFasta(infile, outfile): '''Make a single fasta file for each sample of all contaminant adaptor sequences for removal ''' print(infile) print(REGEX_TRACK) PipelinePreprocess.makeAdaptorFasta( infile=infile, outfile=outfile, track=re.match(REGEX_TRACK, infile).groups()[0], dbh=connect(),
# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten(
                [glob.glob(y) for y in
                 P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)


@follows(mkdir("fasta.dir"))
@transform(unprocessReads,
           regex(SEQUENCEFILES_REGEX),
           r"fasta.dir/\1.fasta")
def makeAdaptorFasta(infile, outfile):
    '''Make a single fasta file for each sample of all contaminant adaptor
    sequences for removal
    '''
    preprocess.makeAdaptorFasta(
        infile=infile,
        outfile=outfile,
        track=re.match(REGEX_TRACK, infile).groups()[0],
        dbh=connect(),
        contaminants_file=P.get_params()['contaminants_path'])
#   Main logic
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


# _________________________________________________________________________________________
#
#   setup_simulation_data
#
# _________________________________________________________________________________________
#
#   mkdir: makes sure output directories exist before task
#
@follows(mkdir(gene_data_dir, simulation_data_dir))
def setup_simulation_data():
    """
    create simulation files
    """
    for i in range(CNT_GENE_GWAS_FILES):
        open(os.path.join(gene_data_dir, "%03d.gene" % i), "w").close()
        open(os.path.join(gene_data_dir, "%03d.gwas" % i), "w").close()

    # gene files without corresponding gwas and vice versa
    open(os.path.join(gene_data_dir, "orphan1.gene"), "w").close()
    open(os.path.join(gene_data_dir, "orphan2.gwas"), "w").close()
    open(os.path.join(gene_data_dir, "orphan3.gwas"), "w").close()

    for i in range(CNT_SIMULATION_FILES):
        open(os.path.join(simulation_data_dir, "%03d.simulation" % i),
             "w").close()
with open(outfile, "w") as oo: oo.write(output_text + file_output_text) # 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 # Tasks # 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 pipeline_active_if = True # # task1 # @follows(mkdir("test_active_if")) @originate(['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter") def task1(outfile, extra): """ First task """ # N.B. originate works with an extra parameter helper(None, outfile) # # task2 # @transform(task1, suffix(".1"), ".2") def task2(infile, outfile): """
import sh
import ruffus
import os
import random
import simplejson as json
import pyprind
import gzip

data_dir = os.environ['DATA']
text8_dir = os.path.join(data_dir, "text8")

N_TRAIN_CHAR_FRAGMENTS = 100000
CHAR_FRAGMENTS_CONTEXT_LENGTH = 50


@ruffus.follows(ruffus.mkdir(text8_dir))
@ruffus.originate(os.path.join(text8_dir, "text8.zip"))
def download_text8(output_file):
    sh.wget("-O", output_file, "http://mattmahoney.net/dc/text8.zip")


@ruffus.transform(download_text8, ruffus.suffix(".zip"), ".txt")
def extract_text8(input_file, output_file):
    sh.cd(text8_dir)
    sh.unzip(input_file)
    print(sh.ls())
    sh.mv("text8", output_file)


@ruffus.transform(extract_text8, ruffus.suffix(".txt"), ".alphabet.json")
def build_alphabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f: