Code example #1
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2,
                                     "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
Code example #2
    def test_newstyle_mkdir (self):
        test_pipeline = Pipeline("test")
        test_pipeline.follows(task_which_makes_directories, mkdir(directories), mkdir(tempdir + 'c'), mkdir(tempdir + 'd', tempdir + 'e'), mkdir(tempdir + 'e'))
        test_pipeline.run(multiprocess = 10, verbose = 0)

        for d in 'abcde':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Code example #3
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
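    #   For instance (an illustrative sketch, not part of the original test),
    #       test_pipeline["task_originate"].set_output(output=other_file_names)
    #   would repoint the starting files after the pipeline has been built.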
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
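
The head/tail tasks above are what make sub-pipelines composable without callers
knowing any internal task names. A minimal sketch of how a caller might join two
pipelines built this way (make_pipeline2 and the file name are assumed here for
illustration):

    pipeline1 = make_pipeline1("pipeline1", [tempdir + "/a.1"])
    pipeline2 = make_pipeline2("pipeline2")   # hypothetical sibling factory
    # passing a Pipeline as input connects its tail tasks to pipeline2's head tasks
    pipeline2.set_input(input=pipeline1)
    pipeline2.run(multiprocess=10, verbose=0)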
Code example #4
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Code example #5
    def test_newstyle_mkdir(self):
        test_pipeline = Pipeline("test")
        test_pipeline.follows(task_which_makes_directories, mkdir(directories),
                              mkdir(tempdir + 'c'),
                              mkdir(tempdir + 'd', tempdir + 'e'),
                              mkdir(tempdir + 'e'))
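        # Note that tempdir + 'e' is passed to mkdir() twice above; the
        # assertions below still expect all of a-e, so duplicate mkdir
        # targets are evidently tolerated.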
        test_pipeline.run(multiprocess=10, verbose=0)

        for d in 'abcde':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Code example #6
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                                input=step_4_split_numbers_into_chunks,
                                filter=suffix(".chunks"),
                                output=".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance, input = step_5_calculate_sum_of_squares, output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again, touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess=50, verbose=0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists(output_file):
            raise Exception("Missing %s" % output_file)
Code example #7
File: test_tutorial7.py Project: Genomicsplc/ruffus
    def test_newstyle_ruffus (self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func = step_5_calculate_sum_of_squares,
                           input = step_4_split_numbers_into_chunks,
                           filter = suffix(".chunks"),
                           output = ".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance, input = step_5_calculate_sum_of_squares, output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again, touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists (output_file):
            raise Exception("Missing %s" % output_file)
Code example #8
    def test_newstyle_mkdir (self):
        test_pipeline = Pipeline("test")

        test_pipeline.follows(task_which_makes_directories,
                         mkdir(directories),
                         mkdir(unicode(tempdir + "c")),
                         mkdir(unicode(tempdir + "d"),
                               unicode(tempdir + "e")),
                         mkdir(unicode(tempdir + "e")))\
            .posttask(touch_file(unicode(tempdir + "f")))

        test_pipeline.originate(task_which_makes_files, [tempdir + "g", tempdir + "h"])
        test_pipeline.run(multiprocess = 10, verbose = 0)

        for d in 'abcdefgh':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Code example #9
    def test_newstyle_mkdir(self):
        test_pipeline = Pipeline("test")

        test_pipeline.follows(task_which_makes_directories,
                              mkdir(directories),
                              mkdir(unicode(tempdir + "c")),
                              mkdir(unicode(tempdir + "d"),
                                    unicode(tempdir + "e")),
                              mkdir(unicode(tempdir + "e")))\
            .posttask(touch_file(unicode(tempdir + "f")))

        test_pipeline.originate(task_which_makes_files,
                                [tempdir + "g", tempdir + "h"])
        test_pipeline.run(multiprocess=10, verbose=0)

        for d in 'abcdefgh':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Code example #10
    def test_newstyle_ruffus (self):

        test_pipeline = Pipeline("test")

        test_pipeline.follows(setup_simulation_data, mkdir(gene_data_dir, simulation_data_dir))

        test_pipeline.files(gwas_simulation, generate_simulation_params)\
            .follows(setup_simulation_data)\
            .follows(mkdir(working_dir, os.path.join(working_dir, "simulation_results")))

        test_pipeline.collate(statistical_summary, gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")\
            .posttask(lambda : sys.stdout.write("\nOK\n"))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        for oo in "000.mean", "001.mean":
            results_file_name = os.path.join(working_dir, oo)
            if not os.path.exists(results_file_name):
                raise Exception("Missing %s" % results_file_name)
Code example #11
File: test_N_x_M_and_collate.py Project: xnox/ruffus
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")

        test_pipeline.follows(setup_simulation_data,
                              mkdir(gene_data_dir, simulation_data_dir))

        test_pipeline.files(gwas_simulation, generate_simulation_params)\
            .follows(setup_simulation_data)\
            .follows(mkdir(working_dir, os.path.join(working_dir, "simulation_results")))

        test_pipeline.collate(statistical_summary, gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")\
            .posttask(lambda: sys.stdout.write("\nOK\n"))

        test_pipeline.run(multiprocess=50, verbose=0)
        for oo in "000.mean", "001.mean":
            results_file_name = os.path.join(working_dir, oo)
            if not os.path.exists(results_file_name):
                raise Exception("Missing %s" % results_file_name)
Code example #12
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2, "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
Code example #13
 def test_newstyle_task(self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func=task2,
                             input=task1,
                             filter=regex(r".*"),
                             output=tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess=10, verbose=0)
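
For comparison, the first step above in decorator style (matching the @files
examples further down this page) would look roughly like this:

    @follows(mkdir(tempdir))
    @files(None, tempdir + 'a.1')
    def task1(infiles, outfiles, *extra_params):
        test_job_io(infiles, outfiles, extra_params)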
Code example #14
 def test_newstyle_task (self):
     """
     Same as above but construct a new pipeline on the fly without decorators
     """
     test_pipeline = Pipeline("test")
     test_pipeline.files(task1, None, tempdir + 'a.1')\
         .follows(mkdir(tempdir))
     test_pipeline.transform(task_func   = task2,
                             input       = task1,
                             filter      = regex(r".*"),
                             output      = tempdir + 'b.1')
     test_pipeline.files(task3, task2, tempdir + 'c.1')
     test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
         .follows(task3)
     test_pipeline.files(task5, task4, tempdir + "f.1")
     test_pipeline.run(multiprocess = 10, verbose = 0)
Code example #15
File: test_collate.py Project: xnox/ruffus
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.split(task_func=prepare_files,
                            input=None,
                            output=tempdir + '*.animal')\
            .follows(mkdir(tempdir, tempdir + "test"))\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

        test_pipeline.collate(task_func=summarise_by_grouping,
                              input=prepare_files,
                              filter=regex(r'(.*/).*\.(.*)\.animal'),
                              output=r'\1\2.results')\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

        test_pipeline.run(multiprocess=10, verbose=0)
        check_species_correct()
Code example #16
def quality_boxplot(in_stats, out_boxplot):
    """draw a boxplot for the quality scores"""
    cmd = 'fastq_quality_boxplot_graph.sh -t %s -i %s -o %s' % (
        in_stats, in_stats, out_boxplot)
    # assuming check_call is subprocess.check_call, a single command string
    # needs shell=True
    check_call(cmd, shell=True)


@transform(quality_stats, suffix('.qual_stats'), '.qual_nuc_dist.png')
def quality_nuc_dist(in_stats, out_dist):
    'show the nucleotide distribution across the reads'
    cmd = 'fastx_nucleotide_distribution_graph.sh -t %s -i %s -o %s' % (
        in_stats, in_stats, out_dist)
    check_call(cmd, shell=True)


@follows(mkdir('summaries'))
@merge([original_reads, clip_adapter, trim_reads, trim_regex,
        filter_artifacts, filter_min_quality],
    #join('..', 'summaries', 'fastq.wikisummary'))
    'fastq.wikisummary')
def summarize_fastq_reads(in_fastq, out_summary):
    """Summarize fastq line counts"""
    with open(out_summary, 'w') as outfile:
        outfile.write("""
{| class="wikitable"
|+ Summary of raw read counts
!scope="col" | Dataset
!scope="col" | Number of raw reads
|-
""")
        for infile in in_fastq:
Code example #17
import sh
import ruffus
import os
import random
import pyprind
import gzip
import simplejson as json

data_dir = os.environ['DATA']
words_dir = os.path.join(data_dir, "words")

# /usr/share/dict/words is a text file full of words on most unix systems


@ruffus.follows(ruffus.mkdir(words_dir))
@ruffus.originate(os.path.join(words_dir, "words.txt"))
def get_words(output_file):
    sh.cp("/usr/share/dict/words", output_file)
    sh.chmod("u+w", output_file)


@ruffus.transform(get_words, ruffus.suffix(".txt"), ".alphabet.json")
def build_alphabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f:
        for line in f:
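            # union() iterates the stripped line, adding each character to the set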
            characters = characters.union(line.rstrip())

    alphabet = list(sorted(characters)) + ['PADDING', 'START', 'END']
Code example #18
    dbh = sqlite3.connect(PARAMS["database_name"])
    return dbh


def connectToUCSC():
    return PipelineGtfsubset.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=PARAMS["ucsc_database"])

############################################################
# Assembly
############################################################


@follows(mkdir('assembly.dir'))
@files(os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fasta"),
       PARAMS['interface_contigs'])
def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from indexed genome :term:`fasta` files and
    outputs to a text file.
    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file
    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, matched
Code example #19
        outfile = '%(line)s_s_%(lane)s.intervals' % filename_re.search(
            infile).groupdict()
        yield [infile, 'intervals/%s' % outfile]


@jobs_limit(1)
@files(['sam/', 'sorted/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)


# Find candidate intervals for realignment
@follows(clean_up, mkdir('intervals', 'logs'))
@files(create_intervals_generator)
def create_intervals(input_file, output_file):
    '''Determine indel candidate intervals'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Interval Creation', input_file, output_file)
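    # the %(key)s placeholders are presumably interpolated from cmd_dict inside
    # the call() helper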
    gatk_cmd = '%(gatk)s --analysis_type RealignerTargetCreator ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(infile)s ' + \
            '--out %(outfile)s'
    call(gatk_cmd, cmd_dict)

Code example #20

# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''

            print(infile)
            print(REGEX_TRACK)

            PipelinePreprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
Code example #21
for key in P.PARAMS:
    if is_none(P.PARAMS[key]):
        P.PARAMS[key] = None
    elif is_on(P.PARAMS[key]):
        P.PARAMS[key] = True

# Global variables
CREATE_BIGWIGS = P.PARAMS.get("run_options_bigwigs")
CREATE_HUB = P.PARAMS.get("run_options_hub")

#############
# Pipeline  #
#############


@follows(mkdir("statistics"), mkdir("statistics/fastqc"))
@transform("*.fastq.gz", regex(r"(.*).fastq.gz"), r"statistics/fastqc/\1_fastqc.zip")
def qc_reads(infile, outfile):
    """Quality control of raw sequencing reads"""

    statement = "fastqc -q -t %(pipeline_n_cores)s --nogroup %(infile)s --outdir statistics/fastqc"

    P.run(
        statement,
        job_queue=P.PARAMS["pipeline_cluster_queue"],
        job_threads=P.PARAMS["pipeline_n_cores"],
        job_condaenv=P.PARAMS["conda_env"],
    )


@merge(qc_reads, "statistics/readqc_report.html")
Code example #22
        COLOR_OCTOMAP_FILE, COLOR_OCTOMAP_BT, MERGED_CLOUD_FILE,\
        CAST_OCTOMAP_SINGLE, MERGED_VTK_FILE, STATIC_CLOUD_FILE,\
        STATIC_VTK_FILE, DYNAMIC_CLOUD_FILE, DYNAMIC_VTK_FILE,\
        FILTERED_CLOUDS_DIR, PARAMS_TO_LOAD,\
        MERGED_COLOR_CLOUDS_DIR, MERGED_COLOR_CLOUD_FILE,\
        MERGED_COLOR_VTK_FILE, LDR_UPSAMPLED_DIR, LDR_DIR,\
        NO_TRANSFORM, CAMERA
from pipeline_utils import file_num

dirs = [
    LDR_DIR, LDR_UPSAMPLED_DIR, POINTS_H5_DIR, PCD_DIR, PCD_DOWNSAMPLED_DIR,
    PCD_DOWNSAMPLED_NORMALS_DIR, ICP_TRANSFORMS_DIR, COLOR_DIR,
    COLOR_CLOUDS_DIR, MERGED_CLOUDS_DIR, MERGED_COLOR_CLOUDS_DIR, OCTOMAP_DIR,
    COLOR_OCTOMAP_DIR, FILTERED_CLOUDS_DIR
]
MKDIRS = [mkdir(d) for d in dirs]

# NOTE chdir into dset dir so can just specify relative paths to data
os.chdir(DSET_DIR)

DOWNLOADS = list()
for f in REMOTE_FILES:
    DOWNLOADS.append([None, f])


@follows(*MKDIRS)
@files(DOWNLOADS)
def download_files(dummy, local_file):
    cmd = 'rsync -vr --ignore-existing %s/%s .' % (REMOTE_DATA_DIR, local_file)
    print(cmd)
    check_call(cmd, shell=True)
Code example #23
    if not os.path.exists(PARAMS["annotations_database"]):
        raise ValueError(
            "can't find database '%s'" %
            PARAMS["annotations_database"])

    statement = '''ATTACH DATABASE '%s' as annotations''' % \
                (PARAMS["annotations_database"])

    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    return dbh


@follows(mkdir("geneset.dir"))
@merge(PARAMS["annotations_interface_geneset_all_gtf"],
       "geneset.dir/reference.gtf.gz")
def buildReferenceGeneSet(infile, outfile):
    ''' filter full gene set and add attributes to create the reference gene set

    Performs merge and filter operations:
       * Merge exons separated by small introns (< 5bp).
       * Remove transcripts with very long introns (`max_intron_size`)
       * Remove transcripts located on contigs to be ignored (`remove_contigs`)
         (usually: chrM, _random, ...)
       * (Optional) Remove transcripts overlapping repetitive sequences
         (`rna_file`)

    This preserves all features in a gtf file (exon, CDS, ...)
Code example #24
from bz2 import BZ2File
from glob import iglob, glob

from ruffus import follows, files, inputs, merge, mkdir, regex, transform

from ..utils import CMD_DICT, call, pmsg, read_group_re

def copy_sequence_generator():
    for in_file in iglob('staging_area/*'):
        out_file = os.path.split(in_file)[-1]
        out_file = out_file.split(os.path.extsep)[0] + '.fastq.gz'
        out_file = os.path.join('fastq', out_file)
        yield [in_file, out_file]

# Copy sequence from staging area
@follows(mkdir('fastq', 'logs'))
@files(copy_sequence_generator)
def copy_sequence(input_file, output_file):
    '''Copy sequence files from staging area'''
    GZIP_HEADER = '\x1f\x8b'
    BZIP_HEADER = 'BZ'
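    # '\x1f\x8b' and 'BZ' are the gzip and bzip2 magic numbers, so mislabelled
    # inputs can still be decompressed transparently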

    pmsg('Copying sequence files', input_file, output_file)
    # check if this is actually a gzipped file
    header = open(input_file).read(2)
    if header == GZIP_HEADER:
        input_file_handle = gzip.open(input_file, 'rb')
    elif header == BZIP_HEADER:
        input_file_handle = BZ2File(input_file, 'r')
    else:
        input_file_handle = open(input_file, 'rb')
Code example #25
File: test_collate.py Project: xnox/ruffus
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#    task1
#


def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)


test_file = tempdir + "task.done"


@follows(mkdir(tempdir, tempdir + "test"))
@posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))
@split(None, tempdir + '*.animal')
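# the glob output pattern tells @split that the number of *.animal files is
# not known until the task has run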
def prepare_files(no_inputs, outputs):
    # cleanup previous
    for f in outputs:
        os.unlink(f)

    for grouping in species_list:
        for species_name in species_list[grouping]:
            filename = tempdir + "%s.%s.animal" % (species_name, grouping)
            with open(filename, "w") as oo:
                oo.write(species_name + "\n")


#
Code example #26
from ruffus import follows, files, inputs, merge, mkdir, regex, transform

from ..utils import CMD_DICT, call, pmsg, read_group_re


def copy_sequence_generator():
    for in_file in iglob('staging_area/*'):
        out_file = os.path.split(in_file)[-1]
        out_file = out_file.split(os.path.extsep)[0] + '.fastq.gz'
        out_file = os.path.join('fastq', out_file)
        yield [in_file, out_file]


# Copy sequence from staging area
@follows(mkdir('fastq', 'logs'))
@files(copy_sequence_generator)
def copy_sequence(input_file, output_file):
    '''Copy sequence files from staging area'''
    GZIP_HEADER = '\x1f\x8b'
    BZIP_HEADER = 'BZ'

    pmsg('Copying sequence files', input_file, output_file)
    # check if this is actually a gzipped file
    header = open(input_file).read(2)
    if header == GZIP_HEADER:
        input_file_handle = gzip.open(input_file, 'rb')
    elif header == BZIP_HEADER:
        input_file_handle = BZ2File(input_file, 'r')
    else:
        input_file_handle = open(input_file, 'rb')
Code example #27
File: mapping.py Project: kdaily/HTS-waterworks
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                           main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')

@follows(mkdir('mapped'))
def make_mapping_dir():
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.
    
    """
    out_fasta, out_db, out_msa = out_files
Code example #28
File: pipeline.py Project: brodyh/sail-car-log
        COLOR_OCTOMAP_DIR, OCTOMAP_FILE,\
        COLOR_OCTOMAP_FILE, COLOR_OCTOMAP_BT, MERGED_CLOUD_FILE,\
        CAST_OCTOMAP_SINGLE, MERGED_VTK_FILE, STATIC_CLOUD_FILE,\
        STATIC_VTK_FILE, DYNAMIC_CLOUD_FILE, DYNAMIC_VTK_FILE,\
        FILTERED_CLOUDS_DIR, PARAMS_TO_LOAD,\
        MERGED_COLOR_CLOUDS_DIR, MERGED_COLOR_CLOUD_FILE,\
        MERGED_COLOR_VTK_FILE, LDR_UPSAMPLED_DIR, LDR_DIR,\
        NO_TRANSFORM, CAMERA
from pipeline_utils import file_num


dirs = [LDR_DIR, LDR_UPSAMPLED_DIR, POINTS_H5_DIR, PCD_DIR, PCD_DOWNSAMPLED_DIR,
        PCD_DOWNSAMPLED_NORMALS_DIR, ICP_TRANSFORMS_DIR, COLOR_DIR,
        COLOR_CLOUDS_DIR, MERGED_CLOUDS_DIR, MERGED_COLOR_CLOUDS_DIR,
        OCTOMAP_DIR, COLOR_OCTOMAP_DIR, FILTERED_CLOUDS_DIR]
MKDIRS = [mkdir(d) for d in dirs]

# NOTE chdir into dset dir so can just specify relative paths to data
os.chdir(DSET_DIR)

DOWNLOADS = list()
for f in REMOTE_FILES:
    DOWNLOADS.append([None, f])
@follows(*MKDIRS)
@files(DOWNLOADS)
def download_files(dummy, local_file):
    cmd = 'rsync -vr --ignore-existing %s/%s .' % (REMOTE_DATA_DIR, local_file)
    print(cmd)
    check_call(cmd, shell=True)

@follows('download_files')
Code example #29
def create_intervals_generator():
    for infile in glob('deduped/*.bam'):
        outfile = '%(line)s_s_%(lane)s.intervals' % filename_re.search(infile).groupdict()
        yield [infile, 'intervals/%s' % outfile]

@jobs_limit(1)
@files(['sam/', 'sorted/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)

# Find candidate intervals for realignment
@follows(clean_up, mkdir('intervals', 'logs'))
@files(create_intervals_generator)
def create_intervals(input_file, output_file):
    '''Determine indel candidate intervals'''
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infile'] = input_file
    cmd_dict['outfile'] = output_file
    pmsg('Interval Creation', input_file, output_file)
    gatk_cmd = '%(gatk)s --analysis_type RealignerTargetCreator ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--input_file %(infile)s ' + \
            '--out %(outfile)s'
    call(gatk_cmd, cmd_dict)

# Realign around possible indels
Code example #30
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#   1   ->  2   ->  3   ->
#       ->  4           ->
#                   5   ->    6
#

def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)
test_file = tempdir + "task.done"
#
#    task1
#
@originate([tempdir + d for d in ('a.1', 'b.1', 'c.1')])
@follows(mkdir(tempdir))
@posttask(lambda: do_write(test_file, "Task 1 Done\n"))
def task1(outfile, *extra_params):
    """
    First task
    """
    with open(tempdir + "jobs.start",  "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))
    test_job_io(None, outfile, extra_params)
    with open(tempdir + "jobs.finish",  "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))


#
#    task2
#
Code example #31
    for sample, infiles in infiles_by_sample.items():
        yield [infiles, 'indels/%s.indels_raw.vcf' % sample]


@jobs_limit(1)
@files(['fixmate/', 'intervals/', 'deduped/', 'realigned/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files from recalibration stage'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)


# Call SNPs
@jobs_limit(1)
@follows(clean_up, mkdir('snps', 'logs'))
@merge('recalibrated/*.bam', 'snps/merged.snps_raw.vcf')
def snp_genotyping(input_files, output_file):
    '''Call SNP variants'''
    pmsg('SNP Genotyping', ', '.join(input_files), output_file)
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infiles'] = ' '.join(
        ['--input_file %s' % f for f in input_files])
    cmd_dict['outfile'] = output_file
    gatk_cmd = '%(gatk)s ' + \
            '--analysis_type UnifiedGenotyper ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--intervals %(target_interval)s ' + \
            '--standard_min_confidence_threshold_for_calling 50 ' + \
            '--standard_min_confidence_threshold_for_emitting 30 ' + \
Code example #32
def touch (filename):
    with open(filename, "w"):
        pass

if sys.hexversion >= 0x03000000:
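    # running under Python 3, where the unicode() built-in is gone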
    unicode = str

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(unicode(tempdir + "a")), unicode(tempdir + "b")]
@follows(mkdir(directories), mkdir(unicode(tempdir + "c")), mkdir(unicode(tempdir + "d"), unicode(tempdir + "e")), mkdir(unicode(tempdir + "e")))
@posttask(touch_file(unicode(tempdir + "f")))
def task_which_makes_directories ():
    pass

@originate([tempdir + "g", tempdir + "h"])
def task_which_makes_files(o):
    touch(o)

import unittest

class Test_task_mkdir(unittest.TestCase):

    def setUp (self):
        """
        """
Code example #33
    with open(filename, "w"):
        pass


if sys.hexversion >= 0x03000000:
    unicode = str

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(unicode(tempdir + "a")), unicode(tempdir + "b")]


@follows(mkdir(directories), mkdir(unicode(tempdir + "c")),
         mkdir(unicode(tempdir + "d"), unicode(tempdir + "e")),
         mkdir(unicode(tempdir + "e")))
@posttask(touch_file(unicode(tempdir + "f")))
def task_which_makes_directories():
    pass


@originate([tempdir + "g", tempdir + "h"])
def task_which_makes_files(o):
    touch(o)


class Test_task_mkdir(unittest.TestCase):
    def setUp(self):
        """
Code example #34
File: test_N_x_M_and_collate.py Project: xnox/ruffus
#   Main logic

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

# _________________________________________________________________________________________
#
#   setup_simulation_data
#
# _________________________________________________________________________________________


#
# mkdir: makes sure output directories exist before task
#
@follows(mkdir(gene_data_dir, simulation_data_dir))
def setup_simulation_data():
    """
    create simulation files
    """
    for i in range(CNT_GENE_GWAS_FILES):
        open(os.path.join(gene_data_dir, "%03d.gene" % i), "w").close()
        open(os.path.join(gene_data_dir, "%03d.gwas" % i), "w").close()

    # gene files without corresponding gwas and vice versa
    open(os.path.join(gene_data_dir, "orphan1.gene"), "w").close()
    open(os.path.join(gene_data_dir, "orphan2.gwas"), "w").close()
    open(os.path.join(gene_data_dir, "orphan3.gwas"), "w").close()

    for i in range(CNT_SIMULATION_FILES):
        open(os.path.join(simulation_data_dir, "%03d.simulation" % i),
Code example #35
        sample = read_group_re.match(infile).groupdict()['sample']
        infiles_by_sample[sample] = infiles_by_sample.get(sample, []) + [infile]
    for sample, infiles in infiles_by_sample.items():
        yield [ infiles, 'indels/%s.indels_raw.vcf' % sample ]

@jobs_limit(1)
@files(['fixmate/', 'intervals/', 'deduped/', 'realigned/'], None)
@check_if_uptodate(check_if_clean)
def clean_up(input_files, output_file):
    '''Clean up intermediate files from recalibration stage'''
    print('Cleaning up intermediate files: %s' % ', '.join(input_files))
    call('rm -rf %s' % ' '.join(input_files), {}, is_logged=False)

# Call SNPs
@jobs_limit(1)
@follows(clean_up, mkdir('snps', 'logs'))
@merge('recalibrated/*.bam', 'snps/merged.snps_raw.vcf')
def snp_genotyping(input_files, output_file):
    '''Call SNP variants'''
    pmsg('SNP Genotyping', ', '.join(input_files), output_file)
    cmd_dict = CMD_DICT.copy()
    cmd_dict['infiles'] = ' '.join([ '--input_file %s' % f for f in input_files ])
    cmd_dict['outfile'] = output_file
    gatk_cmd = '%(gatk)s ' + \
            '--analysis_type UnifiedGenotyper ' + \
            '--reference_sequence %(reference)s ' + \
            '--DBSNP %(dbsnp)s ' + \
            '--intervals %(target_interval)s ' + \
            '--standard_min_confidence_threshold_for_calling 50 ' + \
            '--standard_min_confidence_threshold_for_emitting 30 ' + \
            '--annotation AlleleBalance ' + \
Code example #36
    to_cluster = False

    statement = '''
    rm -rf prereq_* ctmp*;
    rm -rf test_* _cache _static _templates _tmp report;
    rm -f *.log csvdb *.load *.tsv'''
    P.run()

###################################################################
###################################################################
###################################################################
# primary targets
###################################################################


@follows(mkdir("report"))
def build_report():
    '''build report from scratch.'''

    E.info("starting report build process from scratch")
    P.run_report(clean=True)


@follows(mkdir("report"))
def update_report():
    '''update report.'''

    E.info("updating report")
    P.run_report(clean=False)

Code example #37
    dbh = sqlite3.connect(PARAMS["database_name"])
    return dbh


def connectToUCSC():
    return gtfsubset.connectToUCSC(host=PARAMS["ucsc_host"],
                                   user=PARAMS["ucsc_user"],
                                   database=PARAMS["ucsc_database"])


############################################################
# Assembly
############################################################


@follows(mkdir('assembly.dir'))
@files(os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fasta"),
       PARAMS['interface_contigs'])
def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from indexed genome :term:`fasta` files and
    outputs to a text file.
    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file
    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, matched
Code example #38
File: test_follows_mkdir.py Project: bunbun/ruffus
sys.path.insert(0, grandparent_dir)

# module name = script name without extension
module_name = os.path.splitext(os.path.basename(__file__))[0]


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(tempdir + 'a'), tempdir + 'b']


@follows(mkdir(tempdir), mkdir(directories), mkdir(tempdir + 'c'), mkdir(tempdir + 'd', tempdir + 'e'), mkdir(tempdir + 'e'))
def task_which_makes_directories():
    pass


class Test_task_mkdir(unittest.TestCase):

    def setUp(self):
        """
        """
        pass

    def tearDown(self):
        """
        delete directories
        """
Code example #39
# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


def touch(outfile):
    with open(outfile, "w"):
        pass


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "a.1"], [None, tempdir + "b.1"]])
def task1(i, o):
    touch(o)


@follows(mkdir(tempdir))
@ruffus.files([[None, tempdir + "c.1"], [None, tempdir + "d.1"]])
def task2(i, o):
    touch(o)


@transform(task1, regex(r"(.+)"),
           ruffus.inputs(((r"\1"), task2, "test_transform_inputs.*y")),
           r"\1.output")
def task3(i, o):
Code example #40
    output_text = "".join(sorted(output_text))
    output_text += json.dumps(infiles) + " -> " + json.dumps(outfiles) + "\n"
    for f in outfile_names:
        with open(f, "w") as oo:
            oo.write(output_text)



#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

@follows(mkdir(tempdir))
#
#    task1
#
@files(None, tempdir + 'a.1')
def task1(infiles, outfiles, *extra_params):
    """
    First task
    """
    test_job_io(infiles, outfiles, extra_params)



#
#    task2
#
Code example #41
grandparent_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, grandparent_dir)

# module name = script name without extension
module_name = os.path.splitext(os.path.basename(__file__))[0]

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
directories = [os.path.abspath(tempdir + 'a'), tempdir + 'b']


@follows(mkdir(tempdir), mkdir(directories), mkdir(tempdir + 'c'),
         mkdir(tempdir + 'd', tempdir + 'e'), mkdir(tempdir + 'e'))
def task_which_makes_directories():
    pass


class Test_task_mkdir(unittest.TestCase):
    def setUp(self):
        """
        """
        pass

    def tearDown(self):
        """
        delete directories
        """
Code example #42
#


def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)


test_file = tempdir + "task.done"


#
#    task1
#
@originate([tempdir + d for d in ('a.1', 'b.1', 'c.1')])
@follows(mkdir(tempdir))
@posttask(lambda: do_write(test_file, "Task 1 Done\n"))
def task1(outfile, *extra_params):
    """
    First task
    """
    with open(tempdir + "jobs.start", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))
    test_job_io(None, outfile, extra_params)
    with open(tempdir + "jobs.finish", "a") as oo:
        oo.write('job = %s\n' % json.dumps([None, outfile]))


#
#    task2
#
Code example #43
    pass


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, " "you need to run the pipeline once before " "specifying 'auto_remove'" % f
                )

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads, regex(SEQUENCEFILES_REGEX), r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            """Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            """
            PipelinePreprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
                dbh=connect(),
                contaminants_file=PARAMS["contaminants"],
            )

        @merge(makeAdaptorFasta, "contaminants.fasta")
        def aggregateAdaptors(infiles, outfile):
Code example #44
def circos_files():
    link_path = os.path.join(DATA_DIR, 'LinkageResults')
    align_path = os.path.join(DATA_DIR, 'LANLSequences', 'Alignments')

    ifiles = []
    for f in os.listdir(link_path):
        if f.endswith('.res'):
            ifiles.append(os.path.join(link_path, f))
    for f in os.listdir(align_path):
        if f.endswith('.aln'):
            ifiles.append(os.path.join(align_path, f))
    yield ifiles, None


@ruffus.follows(ruffus.mkdir(os.path.join(DATA_DIR, 'CircosFigs')))
@ruffus.files(circos_files)
def circos_figs(ifile, ofile):
    dump_path = os.path.join(DATA_DIR, 'CircosFigs')
    load_path = os.path.join(DATA_DIR, 'LinkageResults')
    align_path = os.path.join(DATA_DIR, 'LANLSequences', 'Alignments')
    prots = set()
    for f in os.listdir(align_path):
        if f.endswith('.aln'):  # presumably '.aln'; the original f.endswith(f) is always True
            prots.add(f)
    prots.add('All')

    lcuts = [0.5, 0.6, 0.7, 0.8, 0.9]
    graph = CircosGraph.load_from_dir(load_path, align_path)
    sort_fun = lambda x: x['Score']
    for lcut, prot in product(lcuts, prots):
Code example #45
File: prepare_words.py Project: BKJackson/txtnets
import sh
import ruffus
import os
import random
import pyprind
import gzip
import simplejson as json


data_dir = os.environ['DATA']
words_dir = os.path.join(data_dir, "words")

# /usr/share/dict/words is a text file full of words on most unix systems

@ruffus.follows(ruffus.mkdir(words_dir))
@ruffus.originate(os.path.join(words_dir, "words.txt"))
def get_words(output_file):
    sh.cp("/usr/share/dict/words", output_file)
    sh.chmod("u+w", output_file)

@ruffus.transform(get_words, ruffus.suffix(".txt"), ".alphabet.json")
def build_alphabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f:
        for line in f:
            characters = characters.union(line.rstrip())

    alphabet = list(sorted(characters)) + ['PADDING', 'START', 'END']

    with open(output_file, 'w') as f:
Code example #46
File: rnaseq.py Project: VanNostrandLab/rnaseq
def estimate_process():
    """Estimate number of processes based on the maximum size of fastq file."""
    
    size = max(SIZES) / (1000 * 1000 * 1000) * 4
    n = int(options.memory / size)
    if n == 0:
        n = 1
    elif n > options.cores:
        n = options.cores
    return n


PROCESS = estimate_process()
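# Worked example (assuming SIZES holds sizes in bytes and options.memory is in
# GB): max(SIZES) = 2e9 -> size = 2 * 4 = 8 GB per process, so with
# options.memory = 32 and options.cores = 16, PROCESS = int(32 / 8) = 4.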


@ruffus.follows(ruffus.mkdir('fastq_to_bam'))
@ruffus.originate(list(FASTQS.keys()))
def soft_link(link):
    """ Create soft links for original fastq files. """
    
    def make_link(path, link):
        if path:
            if path == os.path.abspath(link):
                message = "No symbolic link was made for {path}! You are directly working on the original file!"
                logger.warning(message)
            else:
                if not os.path.exists(link):
                    message = f'Soft link fastq: {os.path.basename(path)} ...'
                    cmding(f'ln -s {path} {link}', message=message)
    link1, link2 = link, link.replace('.r1.fastq.gz', '.r2.fastq.gz')
    make_link(FASTQS[link]['fastq1'], link1)
Code example #47
                  "*.fastq.2.gz",
                  "*.fastq.gz")
FASTQ_DIR = PARAMS['fastq_dir']
# set to value for testing purposes (see regexes below)
if FASTQ_DIR == "?!":
    FASTQ_DIR = ""

FASTQ_FILES = tuple([os.path.join(FASTQ_DIR, suffix_name)
                     for suffix_name in FASTQ_SUFFIXES])
FASTQ_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.1.gz"))
FASTQ_PAIR = os.path.join(FASTQ_DIR, r"\1.fastq.2.gz")
SE_REGEX = regex(os.path.join(FASTQ_DIR, r"(\S+).fastq.gz"))
GENESETS = [y for y in glob.glob(os.path.join("reference.dir/*.gtf.gz"))]


@follows(mkdir("transcripts.dir"))
@transform("%s" % PARAMS['annotations_geneset_gtf'],
           regex("reference.dir/(.+).gtf.gz"),
           r"transcripts.dir/\1.fa")
def makeRepTranscripts(infile, outfile):
    '''
    make a single representative transcript for each
    gene - put into a multi-fasta file
    '''

    genome_file = "/".join([PARAMS['genome_dir'], PARAMS['genome']])

    statement = '''
    zcat %(infile)s |
    cgat gff2fasta
    --genome-file=%(genome_file)s
Code example #48
    statement = '''
    rm -rf prereq_* ctmp*;
    rm -rf test_* _cache _static _templates _tmp report;
    rm -f *.log csvdb *.load *.tsv'''
    P.run(statement)


###################################################################
###################################################################
###################################################################
# primary targets
###################################################################


@follows(mkdir("report"))
def build_report():
    '''build report from scratch.'''

    E.info("starting report build process from scratch")
    run_report(clean=True)


@follows(mkdir("report"))
def update_report():
    '''update report.'''

    E.info("updating report")
    run_report(clean=False)

Code example #49

@active_if("iclip_transcript_region_metagene" in PARAMS["methods"])
@merge(do_iclip_metagene, "transcript_regions.load")
def merge_and_load_region_metagenes(infiles, outfile):

    P.concatenate_and_load(
        infiles,
        outfile,
        regex_filename=".+/(.+)-(.+)-(.+)\.vs\.(.+).tsv.gz",
        cat="source,condition,replicate,geneset",
        options=" -i source -i condition -i replicate -i geneset")


#  -------------------------------------------------------------------------
@follows(mkdir(os.path.join(PARAMS["export"], "images")))
@split(merge_and_load_metagenes,
       os.path.join(PARAMS["export"],
                    "images/*.%s" % PARAMS["plotting"]["format"]))
def do_plots(infile, outfiles):

    code_location = os.path.dirname(__file__)
    script_file = os.path.join(code_location, "plot_metagenes.R")

    statement = "Rscript %(script_file)s"
    P.run(statement)


# ---------------------------------------------------
# Generic pipeline tasks
@follows(do_plots, merge_and_load_region_metagenes)
Code example #50
    cc.close()

    return dbh

# Determine whether the data is paired-end


SPLICED_MAPPING = PARAMS["bam_paired_end"]


#########################################################################
# Count reads as some QC targets require it
#########################################################################


@follows(mkdir("nreads.dir"))
@transform("*.bam",
           suffix(".bam"),
           r"nreads.dir/\1.nreads")
def countReads(infile, outfile):
    '''Count number of reads in input files.'''

    statement = '''printf "nreads \\t" >> %(outfile)s'''

    P.run(statement)

    statement = '''samtools view %(infile)s | wc -l | xargs printf >> %(outfile)s'''

    P.run(statement)

#########################################################################
Code example #51

# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''
            PipelinePreprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
                dbh=connect(),
                contaminants_file=PARAMS['contaminants'])

        @merge(makeAdaptorFasta, "contaminants.fasta")
Code example #52
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError("file %s missing, "
                                 "you need to run the pipeline once before "
                                 "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads, regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''

            print(infile)
            print(REGEX_TRACK)

            PipelinePreprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
                dbh=connect(),
Code example #53
# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten([glob.glob(y) for y in
                                  P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''

            preprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
                dbh=connect(),
                contaminants_file=P.get_params()['contaminants_path'])
Code example #54
#   Main logic


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


# _________________________________________________________________________________________
#
#   setup_simulation_data
#
# _________________________________________________________________________________________

#
# mkdir: makes sure output directories exist before task
#
@follows(mkdir(gene_data_dir, simulation_data_dir))
def setup_simulation_data():
    """
    create simulation files
    """
    for i in range(CNT_GENE_GWAS_FILES):
        open(os.path.join(gene_data_dir, "%03d.gene" % i), "w").close()
        open(os.path.join(gene_data_dir, "%03d.gwas" % i), "w").close()

    # gene files without corresponding gwas and vice versa
    open(os.path.join(gene_data_dir, "orphan1.gene"), "w").close()
    open(os.path.join(gene_data_dir, "orphan2.gwas"), "w").close()
    open(os.path.join(gene_data_dir, "orphan3.gwas"), "w").close()

    for i in range(CNT_SIMULATION_FILES):
        open(os.path.join(simulation_data_dir, "%03d.simulation" % i), "w").close()
Code example #55
File: test_active_if.py Project: bunbun/ruffus
            oo.write(output_text + file_output_text)


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
pipeline_active_if = True
#
#    task1
#


@follows(mkdir("test_active_if"))
@originate(['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")
def task1(outfile, extra):
    """
    First task
    """
    # N.B. originate works with an extra parameter
    helper(None, outfile)


#
#    task2
#
@transform(task1, suffix(".1"), ".2")
def task2(infile, outfile):
    """
Code example #56
File: test_collate.py Project: bunbun/ruffus

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#    task1
#

def do_write(file_name, what):
    with open(file_name, "a") as oo:
        oo.write(what)


test_file = tempdir + "task.done"


@follows(mkdir(tempdir, tempdir + "test"))
@posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))
@split(None, tempdir + '*.animal')
def prepare_files(no_inputs, outputs):
    # cleanup previous
    for f in outputs:
        os.unlink(f)

    for grouping in species_list:
        for species_name in species_list[grouping]:
            filename = tempdir + "%s.%s.animal" % (species_name, grouping)
            with open(filename, "w") as oo:
                oo.write(species_name + "\n")


#
Code example #57
File: test_active_if.py Project: xnox/ruffus
        with open(outfile, "w") as oo:
            oo.write(output_text + file_output_text)


# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks

# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
pipeline_active_if = True
#
#    task1
#


@follows(mkdir("test_active_if"))
@originate(['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")
def task1(outfile, extra):
    """
    First task
    """
    # N.B. originate works with an extra parameter
    helper(None, outfile)


#
#    task2
#
@transform(task1, suffix(".1"), ".2")
def task2(infile, outfile):
    """
Code example #58
File: prepare_text8.py Project: BKJackson/txtnets
import sh
import ruffus
import os
import random
import simplejson as json
import pyprind
import gzip

data_dir = os.environ['DATA']
text8_dir = os.path.join(data_dir, "text8")


N_TRAIN_CHAR_FRAGMENTS = 100000
CHAR_FRAGMENTS_CONTEXT_LENGTH = 50

@ruffus.follows(ruffus.mkdir(text8_dir))
@ruffus.originate(os.path.join(text8_dir, "text8.zip"))
def download_text8(output_file):
    sh.wget("-O", output_file, "http://mattmahoney.net/dc/text8.zip")

@ruffus.transform(download_text8, ruffus.suffix(".zip"), ".txt")
def extract_text8(input_file, output_file):
    sh.cd(text8_dir)
    sh.unzip(input_file)
    print(sh.ls())
    sh.mv("text8", output_file)

@ruffus.transform(extract_text8, ruffus.suffix(".txt"), ".alphabet.json")
def build_alpabet_dictionary(input_file, output_file):
    characters = set()
    with open(input_file) as f: