    def test_job_should_fail_if_too_little_memory_required(self):

        outfile = os.path.join(self.work_dir, "out")

        if P.get_parameters()['os'] == 'Linux':
            self.assertRaises(
                OSError,
                P.run,
                "python -c 'import numpy; "
                "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
                "out = open(\"{outfile}\", \"w\"); "
                "out.write(str(len(a)) + \"\\n\"); "
                "out.close()'".format(memory=self.test_memory_size,
                                      outfile=outfile),
                to_cluster=self.to_cluster,
                job_memory="{}G".format(0.5 * self.test_memory_size / 10**9))
        else:
            pass
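
The test above drives CGATCore's P.run, which submits a shell statement with resource limits and raises OSError when a job exceeds them. A minimal non-test sketch of the same call, with illustrative values (assumes a configured cgat-core installation):

from CGATCore import Pipeline as P

# illustrative sketch only; job_memory caps per-job memory and P.run
# raises OSError if the job exceeds the limit or otherwise fails
P.get_parameters()                 # load default configuration
P.run("hostname > node.txt",
      to_cluster=False,            # run locally for this sketch
      job_memory="1G")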
Example #2
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.parse_commandline(argv, config_file="template.yml")

    global PARAMS
    if options.config_file:
        PARAMS = P.get_parameters(options.config_file,
                                  defaults={
                                      "min_value": 0.0,
                                      "num_samples": 1000,
                                      "mu": 0.0,
                                      "sigma": 1.0
                                  })
    else:
        sys.exit(P.main(options, args))

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(task_func=compute_mean,
                                           input=task_create_files,
                                           filter=ruffus.suffix(".txt"),
                                           output=".mean")

    task_combine_means = pipeline.merge(task_func=combine_means,
                                        input=task_compute_mean,
                                        output="means.txt")

    # primary targets
    pipeline.merge(task_func=P.EmptyRunner("all"),
                   input=task_combine_means,
                   output="all")

    E.debug("starting workflow")
    return P.run_workflow(options, args)
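
The task functions wired into this template (create_files, compute_mean, combine_means) are not part of the snippet. A plausible sketch of what they might look like; the bodies below are assumptions, not the original implementations:

import random
import CGATCore.IOTools as IOTools


def create_files(outfile):
    # hypothetical body: draw num_samples values from a Gaussian
    with IOTools.open_file(outfile, "w") as outf:
        for _ in range(PARAMS["num_samples"]):
            outf.write("{}\n".format(random.gauss(PARAMS["mu"],
                                                  PARAMS["sigma"])))


def compute_mean(infile, outfile):
    # hypothetical body: average the values in one sample file
    values = [float(x) for x in IOTools.open_file(infile)]
    with IOTools.open_file(outfile, "w") as outf:
        outf.write("{}\n".format(sum(values) / len(values)))


def combine_means(infiles, outfile):
    # hypothetical body: concatenate the per-sample means
    with IOTools.open_file(outfile, "w") as outf:
        for infile in infiles:
            outf.write(IOTools.open_file(infile).read())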
    def test_job_should_fail_if_too_little_memory_required_in_second_statement(
            self):

        outfile = os.path.join(self.work_dir, "out")
        infile = "arv=by_id/glon1-4zz18-3cbje7tmr0nitut/study_list.txt"

        if P.get_parameters()['os'] == 'Linux':
            self.assertRaises(
                OSError,
                P.run,
                "hostname > {outfile}; "
                "python -c 'import numpy; "
                "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
                "out = open(\"{outfile}\", \"w\"); "
                "out.write(str(len(a)) + \"\\n\"); "
                "out.close()'".format(memory=self.test_memory_size,
                                      infile=infile,
                                      outfile=outfile),
                to_cluster=self.to_cluster,
                job_memory="{}G".format(0.5 * self.test_memory_size / 10**9))
        else:
            pass
Communicates with the obolibrary (OBO Foundry) API for hierarchical
ontology annotations. Can be used to download and parse any
OWL-formatted ontology available on that site.

"""

from ruffus import *
from CGATCore import Pipeline as P
import os
import sys
import CGATPipelines.PipelineGeneInfo as PipelineGeneInfo
import CGATCore.IOTools as IOTools
import pandas as pd

PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

# pick a pathway to use to check pathway annotation has run
example_pw = PARAMS['my_gene_info_pathway'].split(",")[0]
if example_pw == "all":
    example_pw = 'kegg'
example_homolo = str(PARAMS['my_gene_info_homologene']).split(",")
if len(example_homolo) == 0 or example_homolo[0] == 'all':
    example_homolo = 10090
else:
    example_homolo = example_homolo[0]

# get the list of annotations to be downloaded from my gene info
mgiannotations = PARAMS['my_gene_info_annotations']
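
For illustration, the fallbacks above resolve as follows under two hypothetical configurations (keys from the snippet, values invented):

# my_gene_info_pathway: all          -> example_pw == 'kegg'
# my_gene_info_pathway: reactome,wp  -> example_pw == 'reactome'
# my_gene_info_homologene: all       -> example_homolo == 10090 (mouse taxon ID)
# my_gene_info_homologene: 9606,9598 -> example_homolo == '9606'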
Example #5
    def setUp(self):
        BaseTest.setUp(self)
        P.get_parameters()
import CGATCore.Experiment as E
import CGATCore.IOTools as IOTools

import CGATPipelines.PipelineMotifs as PipelineMotifs
import CGATPipelines.PipelineTracks as PipelineTracks
from CGATPipelines.Report import run_report

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
from CGATCore import Pipeline as P
P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
],
                 defaults={'annotations_dir': ""})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peek_parameters(PARAMS["annotations_dir"], "genesets")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample
Example #7
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.yml in the current
# directory and the common one.

# Path where the code for the pipelines is stored
pipelinesdir = os.path.dirname(CGATPipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(CGATPipelines.__file__),
                       'configuration', 'pipeline.yml')

PARAMS = P.get_parameters([inifile, "pipeline.yml"])

# Definition now part of CGATReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
################################################################
################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
Example #8

def connect():
    '''connect to database.
    Use this method to connect to additional databases.
    Returns a database connection.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])

    return dbh
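
A usage sketch for connect(), assuming PARAMS["database_name"] points at an existing SQLite database; the query is illustrative only:

# list the tables in the pipeline database
dbh = connect()
cc = dbh.cursor()
cc.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cc.fetchall())
dbh.close()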


#########################################################################
P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
],
                 defaults={'paired_end': False},
                 only_import=__name__ != "__main__")

PARAMS = P.PARAMS

PipelineMapping.PARAMS = PARAMS
PipelineMappingQC.PARAMS = PARAMS
PipelineExome.PARAMS = PARAMS
#########################################################################

#########################################################################
# Load manual annotations
#########################################################################

'''

import shutil
import os
import sqlite3
import CGATCore.IOTools as IOTools
import CGAT.IndexedGenome as IndexedGenome
import CGAT.Bed as Bed
from CGATCore import Pipeline as P

############################################################
############################################################
############################################################
# Pipeline configuration
P.get_parameters(["%s.yml" % __file__[:-len(".py")], "pipeline.yml"])
PARAMS = P.PARAMS

############################################################
############################################################
############################################################


def exportIntervalsAsBed(database, query, outfile):
    '''export intervals from an SQLite database as a BED file.'''

    dbhandle = sqlite3.connect(database)
    cc = dbhandle.cursor()
    cc.execute(query)

    outs = IOTools.open_file(outfile, "w")
    # the remainder of this function is reconstructed and an assumption:
    # write one BED line per row, expecting (contig, start, end) columns
    for contig, start, end in cc:
        outs.write("%s\t%i\t%i\n" % (contig, start, end))
    outs.close()
    dbhandle.close()
Example #10
from CGATCore import Pipeline as P
import CGATCore.IOTools as IOTools
import CGAT.Bed as Bed
import cgatpipelines.tasks.peakcalling as PipelinePeakcalling
import PipelineDeNovoMotifs_python3 as PipelineMotifs
import cgatpipelines.tasks.tracks as PipelineTracks


###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"],
    defaults={
        'paired_end': False})

PARAMS = P.PARAMS


PipelinePeakcalling.PARAMS = PARAMS
PipelineMotifs.PARAMS = PARAMS


###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
Example #11
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                    break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png", ):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.open_file(options.output_pattern % filename,
                                            "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.stop()
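
The chunk_iterator_* helpers used above are not included in this snippet. Judging from how their results are consumed (each yielded value is a chunk filename, with output later collected from '<chunk>.out'), a minimal sketch of chunk_iterator_lines could look like this; the exact original semantics are an assumption:

import os


def chunk_iterator_lines(infile, args, prefix=None, use_header=False):
    '''split *infile* into chunks of args[0] lines each and yield the
    chunk filenames (a sketch; the original may differ).'''
    chunk_size = args[0]
    header = infile.readline() if use_header else None
    outf, filename, nlines, nchunk = None, None, 0, 0
    for line in infile:
        if outf is None or nlines >= chunk_size:
            if outf is not None:
                outf.close()
                yield filename
            filename = os.path.join(prefix, "chunk_%06i.in" % nchunk)
            outf = open(filename, "w")
            if header is not None:
                outf.write(header)
            nlines, nchunk = 0, nchunk + 1
        outf.write(line)
        nlines += 1
    if outf is not None:
        outf.close()
        yield filename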
Example #12
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p",
        "--params",
        "--args",
        dest="params",
        type="string",
        help="comma separated list of addtional parameter strings")

    parser.add_option("-m",
                      "--module",
                      dest="module",
                      type="string",
                      help="the full path to the module file",
                      default=None)

    parser.add_option("-i",
                      "--input",
                      dest="input_filenames",
                      type="string",
                      action="append",
                      help="input filename")

    parser.add_option("-o",
                      "--output-section",
                      dest="output_filenames",
                      type="string",
                      action="append",
                      help="output filename")

    parser.add_option("-f",
                      "--function",
                      dest="function",
                      type="string",
                      help="the module function",
                      default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.start(parser)

    # Check a module and function have been specified
    if not options.module or not options.function:
        raise ValueError("Both a module and a function must be specified")

    # initialize defaults
    P.get_parameters()

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)

    module = importlib.import_module(module_base_name)
    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            str(msg) + ": unknown function, available functions are: %s" %
            ",".join([x for x in dir(module) if not x.startswith("_")]))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "Expecting infile+outfile+params or infile+outfile or params")

    E.stop()
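
A module usable with this runner exposes a function whose signature matches the dispatch above (infiles and outfiles, params, or all three). A hypothetical example, with invented module and function names:

# contents of a hypothetical mymodule.py
def count_lines(infile, outfile):
    # single input file and single output file, no params
    with open(infile) as inf, open(outfile, "w") as outf:
        outf.write("%i\n" % sum(1 for _ in inf))

This might then be invoked as: python <this_script>.py --module=/path/to/mymodule.py --function=count_lines --input=in.txt --output-section=out.txt (the script name is not given in the snippet).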