Example #1
def execute(statement, **kwargs):
    '''execute a statement locally.

    This method implements the same parameter interpolation
    as the function :func:`run`.

    Arguments
    ---------
    statement : string
        Command line statement to run.

    Returns
    -------
    stdout : string
        Data sent to standard output by command
    stderr : string
        Data sent to standard error by command
    '''

    if not kwargs:
        kwargs = get_caller_locals()

    kwargs = dict(list(get_params().items()) + list(kwargs.items()))

    logger = get_logger()
    logger.info("running %s" % (statement % kwargs))

    if "cwd" not in kwargs:
        cwd = get_params()["work_dir"]
    else:
        cwd = kwargs["cwd"]

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    # always use bash
    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement % kwargs,
                               cwd=cwd,
                               shell=True,
                               stdin=sys.stdin,
                               stdout=sys.stdout,
                               stderr=sys.stderr,
                               env=os.environ.copy(),
                               executable="/bin/bash")

    # process.stdin.close()
    stdout, stderr = process.communicate()

    if process.returncode != 0:
        raise OSError(
            "Child was terminated by signal %i: \n"
            "The stderr was: \n%s\n%s\n" %
            (-process.returncode, stderr, statement))

    return stdout, stderr
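
A minimal usage sketch for the example above; the filename is illustrative. Because stdout/stderr are inherited from the calling process in this implementation, the return values may be None here.

# hypothetical input file; picked up via get_caller_locals() for "%(infile)s"
infile = "data.tsv"
execute("wc -l %(infile)s > %(infile)s.counts")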
Example #2
    def __init__(self, **kwargs):

        self.logger = get_logger()
        self.queue_manager = None
        self.run_on_cluster = will_run_on_cluster(kwargs)
        self.job_threads = kwargs.get("job_threads", 1)

        if "job_memory" in kwargs and "job_total_memory" in kwargs:
            raise ValueError(
                "both job_memory and job_total_memory have been given")

        self.job_total_memory = kwargs.get('job_total_memory', None)
        self.job_memory = kwargs.get('job_memory', None)

        if self.job_total_memory == "unlimited" or self.job_memory == "unlimited":
            self.job_total_memory = self.job_memory = "unlimited"
        else:
            if self.job_total_memory:
                self.job_memory = iotools.bytes2human(
                    iotools.human2bytes(self.job_total_memory) /
                    self.job_threads)
            elif self.job_memory:
                self.job_total_memory = self.job_memory * self.job_threads
            else:
                self.job_memory = get_params()["cluster"].get(
                    "memory_default", "4G")
                if self.job_memory == "unlimited":
                    self.job_total_memory = "unlimited"
                else:
                    self.job_total_memory = self.job_memory * self.job_threads

        self.ignore_pipe_errors = kwargs.get('ignore_pipe_errors', False)
        self.ignore_errors = kwargs.get('ignore_errors', False)

        self.job_name = kwargs.get("job_name", "unknown_job_name")
        self.task_name = kwargs.get("task_name", "unknown_task_name")

        # deduce output directory/directories, requires somewhat
        # consistent naming in the calling function.
        outfiles = []
        if "outfile" in kwargs:
            outfiles.append(kwargs["outfile"])
        if "outfiles" in kwargs:
            outfiles.extend(kwargs["outfiles"])

        self.output_directories = set(
            sorted([os.path.dirname(x) for x in outfiles]))

        self.options = kwargs

        self.work_dir = get_params()["work_dir"]

        self.shellfile = kwargs.get("shell_logfile", None)
        if self.shellfile:
            if not self.shellfile.startswith(os.sep):
                self.shellfile = os.path.join(self.work_dir,
                                              os.path.basename(self.shellfile))
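
A small sketch of the memory arithmetic in the constructor above, assuming the cgatcore.iotools helpers behave as they are used there; the values are illustrative.

from cgatcore import iotools

job_total_memory, job_threads = "8G", 4
# total memory divided across threads, mirroring the constructor's logic
job_memory = iotools.bytes2human(
    iotools.human2bytes(job_total_memory) / job_threads)
print(job_memory)  # roughly "2G" per thread; exact formatting depends on the helper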
Example #3
def file_is_mounted(filename):
    """return True if filename is mounted.

    A file is likely to be mounted if it is located
    inside a subdirectory of the local scratch directory.
    """
    if get_params()["mount_point"]:
        return os.path.abspath(filename).startswith(get_params()["mount_point"])
    else:
        return False
Example #4
def get_temp_file(dir=None,
                  shared=False,
                  suffix="",
                  mode="w+",
                  encoding="utf-8"):
    '''get a temporary file.

    The file is created and the caller needs to close and delete the
    temporary file once it is not used any more. By default, the file
    is opened as a text file (mode ``w+``) with encoding ``utf-8``
    instead of the default mode ``w+b`` used in
    :class:`tempfile.NamedTemporaryFile`

    If dir does not exist, it will be created.

    Arguments
    ---------
    dir : string
        Directory of the temporary file and if not given is set to the
        default temporary location in the global configuration dictionary.
    shared : bool
        If set, the temporary file will be in a shared temporary
        location (given by the global configuration dictionary).
    suffix : string
        Filename suffix

    Returns
    -------
    file : File
        A file object of the temporary file.

    '''
    if dir is None:
        if shared:
            dir = get_params()['shared_tmpdir']
        else:
            dir = get_params()['tmpdir']

    if not os.path.exists(dir):
        try:
            os.makedirs(dir)
        except OSError:
            # avoid race condition when several processes try to create
            # temporary directory.
            pass
        if not os.path.exists(dir):
            raise OSError(
                "temporary directory {} could not be created".format(dir))

    return tempfile.NamedTemporaryFile(dir=dir,
                                       delete=False,
                                       prefix="ctmp",
                                       mode=mode,
                                       encoding=encoding,
                                       suffix=suffix)
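
A usage sketch: because the file is created with delete=False, the caller closes and removes it explicitly. The suffix and content below are illustrative.

import os

tmpf = get_temp_file(suffix=".tsv")
try:
    tmpf.write("gene\tcount\n")  # illustrative content
    tmpf.flush()
    # ... hand tmpf.name to a downstream tool here ...
finally:
    tmpf.close()
    os.unlink(tmpf.name)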
Example #5
def interpolate_statement(statement, kwargs):
    '''interpolate command line statement with parameters

    The placeholders in `statement` are resolved by string interpolation
    using a dictionary built from the global configuration dictionary
    PARAMS, augmented by `kwargs`. The latter takes precedence.

    Arguments
    ---------
    statement: string
        Command line statement to be interpolated.
    kwargs : dict
        Keyword arguments that are used for parameter interpolation.

    Returns
    -------
    statement : string
        The command line statement with interpolated parameters.

    Raises
    ------
    KeyError
        If ``statement`` contains unresolved references.

    '''

    local_params = substitute_parameters(**kwargs)

    # build the statement
    try:
        statement = statement % local_params
    except KeyError as msg:
        raise KeyError(
            "Error when creating command: could not "
            "find %s in dictionaries" % msg)
    except ValueError as msg:
        raise ValueError(
            "Error when creating command: %s, statement = %s" % (
                msg, statement))

    # cleaning up of statement
    # remove new lines and superfluous spaces and tabs
    statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip()
    if statement.endswith(";"):
        statement = statement[:-1]

    # mark arvados mount points in statement
    if get_params().get("mount_point", None):
        statement = re.sub(get_params()["mount_point"], "arv=", statement)

    return statement
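
A usage sketch of the interpolation contract: placeholders use Python's ``%(name)s`` syntax and are resolved from PARAMS augmented by the supplied dictionary. The values below are illustrative and assume they are not overridden elsewhere in the configuration.

stmt = interpolate_statement(
    "sort -k1,1 %(infile)s | gzip > %(outfile)s;",
    dict(infile="input.tsv", outfile="sorted.tsv.gz"))
# expected: "sort -k1,1 input.tsv | gzip > sorted.tsv.gz"
# (whitespace collapsed, trailing ';' stripped)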
Example #6
def get_database_name():
    '''Return the database name associated with the pipeline.

    This method looks in different sections of the ini file to permit
    both the old style ``database`` and the new style ``database_name``.

    This method has been implemented for backwards compatibility.

    Returns
    -------
    databasename : string
        The database name.

    Raises
    ------
    KeyError
       If no database name is found

    '''

    locations = ["database_name", "database"]
    params = get_params()
    for location in locations:
        database = params.get(location, None)
        if database is not None:
            return database

    raise KeyError("database name not found")
Example #7
def setup_logging(options, pipeline=None):

    logger = logging.getLogger("cgatcore.pipeline")

    if options.log_config_filename is None:

        # set up default file logger
        handler = logging.FileHandler(filename=options.pipeline_logfile,
                                      mode="a")

        if pipeline is not None:
            pipeline_name = pipeline.name
        else:
            pipeline_name = get_params().get("pipeline_name", "main")

        handler.setFormatter(
            E.MultiLineFormatter("%(asctime)s %(levelname)s "
                                 "%(app_name)s %(module)s "
                                 "- %(message)s"))

        logger.addFilter(LoggingFilterpipelineName(name=pipeline_name))
        logger.addHandler(handler)

        logger.info("pipeline log is {}".format(options.pipeline_logfile))

    return logger
Example #8
def connect():
    """connect to SQLite database used in this pipeline.

    .. note::
       This method is currently only implemented for sqlite
       databases. It needs refactoring for generic access.
       Alternatively, use a full or partial ORM.

    If ``annotations_database`` is in params, this method
    will attach the named database as ``annotations``.

    Returns
    -------
    dbh
       a database handle

    """

    # Note that in the future this might return an sqlalchemy or
    # db.py handle.
    url = get_params()["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")
    
    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}
        
    creator = None
    if is_sqlite3 and "annotations_dir" in get_params():
        # not sure what the correct way is for url
        # sqlite:///./csvdb -> ./csvdb
        # sqlite:////path/to/csvdb -> /path/to/csvdb
        filename = os.path.abspath(url[len("sqlite:///"):])
        
        def creator():
            conn = sqlite3.connect(filename)
            conn.execute("ATTACH DATABASE '{}' as annotations".format(
                os.path.join(get_params()["annotations_dir"], "csvdb")))
            return conn
        
    engine = sqlalchemy.create_engine(
        url,
        connect_args=connect_args,
        creator=creator)

    return engine
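
A usage sketch with the returned SQLAlchemy engine; the table name is illustrative and SQLAlchemy 1.4+ connection semantics are assumed.

import sqlalchemy

dbh = connect()
with dbh.connect() as conn:
    n_rows = conn.execute(
        sqlalchemy.text("SELECT COUNT(*) FROM my_table")).scalar()
print(n_rows)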
Example #9
def get_temp_dir(dir=None, shared=False, clear=False):
    '''get a temporary directory.

    The directory is created and the caller needs to delete the temporary
    directory once it is not used any more.

    If dir does not exist, it will be created.

    Arguments
    ---------
    dir : string
        Directory of the temporary directory and if not given is set to the
        default temporary location in the global configuration dictionary.
    shared : bool
        If set, the temporary directory will be in a shared temporary
        location.

    Returns
    -------
    tmpdir : string
        Absolute path of the temporary directory.

    '''
    if dir is None:
        if shared:
            dir = get_params()['shared_tmpdir']
        else:
            dir = get_params()['tmpdir']

    if not os.path.exists(dir):
        os.makedirs(dir)

    tmpdir = tempfile.mkdtemp(dir=dir, prefix="ctmp")
    if clear:
        os.rmdir(tmpdir)
    return tmpdir
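
A usage sketch: the caller removes the directory once it is no longer needed. The file written below is illustrative.

import os
import shutil

workdir = get_temp_dir(shared=True)
try:
    with open(os.path.join(workdir, "chunk.tsv"), "w") as outf:
        outf.write("a\tb\n")
finally:
    shutil.rmtree(workdir)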
Example #10
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = get_params().get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = iotools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = iotools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = iotools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    outfile.close()

    return c
Example #11
def build_load_statement(tablename, retry=True, options=""):
    """build a command line statement to upload data.

    Upload is performed via the :doc:`csv2db` script.

    The returned statement is suitable to use in pipe expression.
    This method is aware of the configuration values for database
    access and the chosen database backend.

    For example::

        load_statement = P.build_load_statement("data")
        statement = "cat data.txt | %(load_statement)s"
        P.run(statement)

    Arguments
    ---------
    tablename : string
        Tablename for upload
    retry : bool
        Add the ``--retry`` option to `csv2db.py`
    options : string
        Command line options to be passed on to `csv2db.py`

    Returns
    -------
    string

    """

    opts = []

    if retry:
        opts.append(" --retry ")

    params = get_params()
    opts.append("--database-url={}".format(params["database"]["url"]))

    db_options = " ".join(opts)
    load_statement = (
        "python -m cgatcore.csv2db {db_options} {options} --table={tablename}".
        format(**locals()))

    return load_statement
Example #12
def print_config_files():
    '''
        Print the list of .yml files used to configure the pipeline
        along with their associated priorities.
        Priority 1 is the highest.
    '''

    filenames = get_params()['pipeline_yml']
    print("\n List of .yml files used to configure the pipeline")
    s = len(filenames)
    if s == 0:
        print(" No yml files passed!")
    elif s >= 1:
        print(" %-11s: %s " % ("Priority", "File"))
        for f in filenames:
            if s == 1:
                print(" (highest) %s: %s\n" % (s, f))
            else:
                print(" %-11s: %s " % (s, f))
            s -= 1
Example #13
def get_mounted_location(filename):
    """return location of filename within mounted directory

    """
    return os.path.abspath(filename)[len(get_params()["mount_point"]):]
Example #14
def peek_parameters(workingdir,
                    pipeline,
                    on_error_raise=None,
                    prefix=None,
                    update_interface=False,
                    restrict_interface=False):
    '''peek configuration parameters from an external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in `workingdir`, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If  True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = get_caller_locals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not is_test() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if ("config" in sys.argv or "check" in sys.argv
            or "clone" in sys.argv and workingdir == "?!"):
        workingdir = os.path.join(get_params()["pipelinedir"],
                                  "pipeline_" + pipeline)

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "cgatflow {} -v 0 dump".format(pipeline)

    os.environ.update(
        {'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") % (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                if isinstance(value, str):
                    dump[key] = os.path.join(workingdir, value)
                elif isinstance(value, collections.abc.Mapping):
                    for kkey, vvalue in list(value.items()):
                        value[kkey] = os.path.join(workingdir, vvalue)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
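
A usage sketch: import only the interface section of an external pipeline into the current configuration, prefixed to avoid key clashes. The directory and pipeline name are hypothetical.

external = peek_parameters(
    "/path/to/annotations_dir",   # hypothetical working directory
    "annotations",                # hypothetical pipeline name
    prefix="annotations_",
    update_interface=True,
    restrict_interface=True)
get_params().update(external)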
Example #15
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    pipeline: object
        pipeline to run. If not given, all ruffus pipelines are run.

    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create the local scratch directory if it does not already exist.
    # Note that the directory itself will not be deleted, but its
    # contents should be cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warning(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and not HAS_DRMAA
                            and not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for pipeline
                        # controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of multiprocessing.
                        opts = {
                            "multiprocess": options.multiprocess,
                            "pool_manager": "gevent"
                        }
                        # create the session proxy
                        start_session()

                    logger.info("current directory is {}".format(os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.
                        exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobJerror
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
Example #16
def initialize(argv=None, caller=None, defaults=None, **kwargs):
    """setup the pipeline framework.

    Arguments
    ---------
    argv : list
        Command line arguments. If not given, ``sys.argv`` is used.
    caller : string
        Path of the calling script, used to locate its default
        configuration file. If not given, it is deduced from the caller.
    defaults : dictionary
        Dictionary with default values to be added to the global
        parameters dictionary.

    Additional keyword arguments will be passed to the
    :func:`~.parse_commandline` function to set command-line defaults.

    """
    if argv is None:
        argv = sys.argv

    # load default options from config files
    if caller:
        path = os.path.splitext(caller)[0]
    else:
        try:
            path = os.path.splitext(get_caller().__file__)[0]
        except AttributeError as ex:
            path = "unknown"

    options, args = parse_commandline(argv, **kwargs)

    get_parameters([
        os.path.join(path, "pipeline.yml"), "../pipeline.yml",
        options.config_file
    ],
                   defaults=defaults)

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS
    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    logger = logging.getLogger("cgatcore.pipeline")

    logger.info("started in directory: {}".format(
        get_params().get("start_dir")))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(get_params(), options)

    code_location, version = get_version()
    logger.info("code location: {}".format(code_location))
    logger.info("code version: {}".format(version))

    logger.info("working directory is: {}".format(
        get_params().get("work_dir")))
    work_dir = get_params().get("work_dir")
    if not os.path.exists(work_dir):
        E.info(
            "working directory {} does not exist - creating".format(work_dir))
        os.makedirs(work_dir)
    logger.info("changing directory to {}".format(work_dir))
    os.chdir(work_dir)

    logger.info("pipeline has been initialized")

    return options, args
Example #17
def concatenate_and_load(infiles,
                         outfile,
                         regex_filename=None,
                         header=None,
                         cat="track",
                         has_titles=True,
                         missing_value="na",
                         retry=True,
                         tablename=None,
                         options="",
                         job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical concatenate and load task in ruffus would look like this::

        @merge("*.tsv.gz", ".load")
        def loadData(infile, outfile):
            P.concatenateAndLoad(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, *regex_filename* is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they are
        added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``,
        it will add the columns ``track`` and ``method`` to the table.
    header : string
        Comma-separated list of values for header.
    cat : string
        Column title for column containing the track name. The track name
        is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    tablename: string
        Name to use for table. If unset derive from outfile.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.

    """
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if tablename is None:
        tablename = to_table(outfile)

    infiles = " ".join(infiles)

    passed_options = options
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    statement = '''python -m cgatcore.tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    to_cluster = False
    run(statement)
Example #18
    def build_job_script(self, statement):
        '''build a job script from a statement.

        Returns a tuple (statement, job_path).
        '''
        tmpfilename = get_temp_filename(dir=self.work_dir, clear=True)
        tmpfilename = tmpfilename + ".sh"

        expanded_statement, cleanup_funcs = self.expand_statement(statement)

        with open(tmpfilename, "w") as tmpfile:
            # disabled: -l -O expand_aliases\n" )

            # make executable
            tmpfile.write("#!/bin/bash -eu\n")
            if not self.ignore_pipe_errors:
                tmpfile.write("set -o pipefail\n")

            os.chmod(tmpfilename, stat.S_IRWXG | stat.S_IRWXU)

            tmpfile.write("\ncd {}\n".format(self.work_dir))
            if self.output_directories is not None:
                for outdir in self.output_directories:
                    if outdir:
                        tmpfile.write("\nmkdir -p {}\n".format(outdir))

            # create and set system scratch dir for temporary files
            tmpfile.write("umask 002\n")

            cluster_tmpdir = get_params()["cluster_tmpdir"]

            if self.run_on_cluster and cluster_tmpdir:
                tmpdir = cluster_tmpdir
                tmpfile.write("TMPDIR=`mktemp -d -p {}`\n".format(tmpdir))
                tmpfile.write("export TMPDIR\n")
            else:
                tmpdir = get_temp_dir(dir=get_params()["tmpdir"], clear=True)
                tmpfile.write("mkdir -p {}\n".format(tmpdir))
                tmpfile.write("export TMPDIR={}\n".format(tmpdir))

            cleanup_funcs.append(
                ("clean_temp", "{{ rm -rf {}; }}".format(tmpdir)))

            # output times whenever script exits, preserving
            # return status
            cleanup_funcs.append(
                ("info", "{ echo 'benchmark'; hostname; times; }"))
            for cleanup_func, cleanup_code in cleanup_funcs:
                tmpfile.write("\n{}() {}\n".format(cleanup_func, cleanup_code))

            tmpfile.write("\nclean_all() {{ {}; }}\n".format("; ".join(
                [x[0] for x in cleanup_funcs])))

            tmpfile.write("\ntrap clean_all EXIT\n\n")

            if self.job_memory not in ("unlimited", "etc") and \
               self.options.get("cluster_memory_ulimit", False):
                # restrict virtual memory
                # Note that there are resources in SGE which could do this directly
                # such as v_hmem.
                # Note that limiting resident set sizes (RSS) with ulimit is not
                # possible in newer kernels.
                # -v and -m accept memory in kb
                requested_memory_kb = max(
                    1000,
                    int(
                        math.ceil(
                            iotools.human2bytes(self.job_memory) / 1024 *
                            self.job_threads)))
                # unsetting error exit as often not permissions
                tmpfile.write("set +e\n")
                tmpfile.write(
                    "ulimit -v {} > /dev/null \n".format(requested_memory_kb))
                tmpfile.write(
                    "ulimit -m {} > /dev/null \n".format(requested_memory_kb))
                # set as hard limit
                tmpfile.write("ulimit -H -v > /dev/null \n")
                tmpfile.write("set -e\n")

            if self.shellfile:

                # make sure path exists that we want to write to
                tmpfile.write("mkdir -p $(dirname \"{}\")\n".format(
                    self.shellfile))

                # output low-level debugging information to a shell log file
                tmpfile.write('echo "%s : START -> %s" >> %s\n' %
                              (self.job_name, tmpfilename, self.shellfile))
                # disabled - problems with quoting
                # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
                # (shellquote(statement), self.shellfile) )
                tmpfile.write("set | sed 's/^/%s : /' >> %s\n" %
                              (self.job_name, self.shellfile))
                tmpfile.write("pwd | sed 's/^/%s : /' >> %s\n" %
                              (self.job_name, self.shellfile))
                tmpfile.write("hostname | sed 's/^/%s: /' >> %s\n" %
                              (self.job_name, self.shellfile))
                # cat /proc/meminfo is Linux specific
                if get_params()['os'] == 'Linux':
                    tmpfile.write(
                        "cat /proc/meminfo | sed 's/^/%s: /' >> %s\n" %
                        (self.job_name, self.shellfile))
                elif get_params()['os'] == 'Darwin':
                    tmpfile.write("vm_stat | sed 's/^/%s: /' >> %s\n" %
                                  (self.job_name, self.shellfile))
                tmpfile.write('echo "%s : END -> %s" >> %s\n' %
                              (self.job_name, tmpfilename, self.shellfile))
                tmpfile.write("ulimit | sed 's/^/%s: /' >> %s\n" %
                              (self.job_name, self.shellfile))

            job_path = os.path.abspath(tmpfilename)

            tmpfile.write(expanded_statement)
            tmpfile.write("\n\n")
            tmpfile.close()

        return statement, job_path
Example #19
def creator():
    conn = sqlite3.connect(filename)
    conn.execute("ATTACH DATABASE '{}' as annotations".format(
        os.path.join(get_params()["annotations_dir"], "csvdb")))
    return conn
Example #20
def submit(module,
           function,
           args=None,
           infiles=None,
           outfiles=None,
           to_cluster=True,
           logfile=None,
           job_options="",
           job_threads=1,
           job_memory=False):
    '''submit a Python *function* as a job to the cluster.

    This method runs the script :file:`run_function` using the
    :func:`run` method in this module, thus providing the same
    control options as for command line tools.

    Arguments
    ---------
    module : string
        Module name that contains the function. If `module` is
        not part of the PYTHONPATH, an absolute path can be given.
    function : string
        Name of function to execute
    infiles : string or list
        Filenames of input data
    outfiles : string or list
        Filenames of output data
    logfile : filename
        Logfile to provide to the ``--log`` option
    job_options : string
        String for generic job options for the queuing system
    job_threads : int
        Number of slots (threads/cores/CPU) to use for the task
    job_memory : string
        Amount of memory to reserve for the job.

    '''

    if not job_memory:
        job_memory = get_params().get("cluster_memory_default", "2G")

    if type(infiles) in (list, tuple):
        infiles = " ".join(["--input=%s" % x for x in infiles])
    else:
        infiles = "--input=%s" % infiles

    if type(outfiles) in (list, tuple):
        outfiles = " ".join(["--output-section=%s" % x for x in outfiles])
    else:
        outfiles = "--output-section=%s" % outfiles

    if logfile:
        logfile = "--log=%s" % logfile
    else:
        logfile = ""

    if args:
        args = "--args=%s" % ",".join(args)
    else:
        args = ""

    statement = ("python -m cgatcore.pipeline.run_function "
                 "--module=%(module)s "
                 "--function=%(function)s "
                 "%(logfile)s "
                 "%(infiles)s "
                 "%(outfiles)s "
                 "%(args)s")
    run(statement)
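
A usage sketch: run a Python function from a task-library module as a cluster job. The module, function and filenames are hypothetical.

submit("tasks",               # hypothetical module on the PYTHONPATH
       "summarize_counts",    # hypothetical function inside that module
       infiles="counts.tsv",
       outfiles="summary.tsv",
       job_memory="4G",
       job_threads=2)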
Example #21
def run(statement, **kwargs):
    """run a command line statement.

    This function runs a single or multiple statements either locally
    or on the cluster using drmaa. How a statement is executed or how
    it is modified depends on the context.

    The context is provided by keyword arguments provided as named
    function arguments ('kwargs') but also from defaults (see
    below). The following keyword arguments are recognized:

    job_memory
        memory to use for the job per thread. Memory specification should be in a
        format that is accepted by the job scheduler. Note that memory
        is per thread. If you have 6 threads and the total memory is
        6Gb, use 1G as job_memory.
    job_total_memory
        total memory to use for a job. This will be divided by the number of
        threads.
    job_threads
        number of threads to request for the job.
    job_options
        options to the job scheduler.
    job_condaenv
        conda environment to use for the job.
    job_array
        if set, run statement as an array job. Job_array should be
        tuple with start, end, and increment.

    Any additional keyword arguments are used to interpolate the
    command line string using Python's '%' string interpolation
    operator.

    The context is built in a hierarchical manner, with successive
    operations overwriting previous values.

    1. Global variables
       The context is initialized
       with system-wide defaults stored in the global PARAMS
       singleton.
    2. Context of caller
       The context of the calling function is examined
       and any local variables defined in this context are added.
    3. kwargs
       Any options given explicitly as arguments to the run() method
       are added.
    4. params
       If the context of the calling function contains a params
       variable, its contents are added to the context. This permits
       setting variables in configuration files in TaskLibrary
       functions.

    By default, a job is sent to the cluster, unless:

        * ``to_cluster`` is present and set to None.

        * ``without_cluster`` is True.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many sessions, or
          sessions become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub still
          work when invoked directly.

    The job will be executed within PARAMS["work_dir"], unless
    PARAMS["work_dir"] is not local. In that case, the job will
    be executed in a shared temporary directory.

    Arguments
    ---------
    statement : string or list of strings
        A command line statement or a list of command line statements
        to be executed.
    kwargs : dictionary
        Context for job. The context is used to interpolate the command
        line statement.

    """
    logger = get_logger()

    # combine options using priority
    options = dict(list(get_params().items()))
    caller_options = get_caller_locals()
    options.update(list(caller_options.items()))

    if "self" in options:
        del options["self"]
    options.update(list(kwargs.items()))

    # inject params named tuple from TaskLibrary functions into option
    # dict. This allows overriding options set in the code with options set
    # in a .yml file
    if "params" in options:
        try:
            options.update(options["params"]._asdict())
        except AttributeError:
            pass

    # insert parameters supplied through simplified interface such
    # as job_memory, job_options, job_queue
    options['cluster']['options'] = options.get('job_options',
                                                options['cluster']['options'])
    options['cluster']['queue'] = options.get('job_queue',
                                              options['cluster']['queue'])
    options['without_cluster'] = options.get('without_cluster')

    # SGE compatible job_name
    name_substrate = str(options.get("outfile", "cgatcore"))
    if os.path.basename(name_substrate).startswith("result"):
        name_substrate = os.path.basename(os.path.dirname(name_substrate))
    else:
        name_substrate = os.path.basename(name_substrate)

    options["job_name"] = re.sub("[:]", "_", name_substrate)
    try:
        calling_module = get_caller().__name__
    except AttributeError:
        calling_module = "unknown"

    options["task_name"] = calling_module + "." + get_calling_function()

    # build statements using parameter interpolation
    if isinstance(statement, list):
        statement_list = []
        for stmt in statement:
            statement_list.append(interpolate_statement(stmt, options))
    else:
        statement_list = [interpolate_statement(statement, options)]

    if len(statement_list) == 0:
        logger.warn("no statements found - no execution")
        return []

    if options.get("dryrun", False):
        for statement in statement_list:
            logger.info("dry-run: {}".format(statement))
        return []

    # execute statement list
    runner = make_runner(**options)
    with runner as r:
        benchmark_data = r.run(statement_list)

    # log benchmark_data
    for data in benchmark_data:
        logger.info(json.dumps(data))

    BenchmarkData = collections.namedtuple('BenchmarkData',
                                           sorted(benchmark_data[0]))
    return [BenchmarkData(**d) for d in benchmark_data]
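
A usage sketch of the context rules described above: the ``job_*`` resource requests and the interpolation values are picked up from the caller's local variables. Filenames are illustrative.

def count_reads(infile, outfile):
    # local variables are collected via get_caller_locals() and used both
    # for resource requests and for "%(...)s" interpolation
    job_memory = "4G"
    job_threads = 2
    statement = "zcat %(infile)s | wc -l > %(outfile)s"
    run(statement)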
Example #22
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns where the first column
        is the row name into a multi-column table.  The value of
        collapse is the value used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        transpose.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """

    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if not tablename:
        tablename = to_table(outfile)

    statement = []

    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append("python -m cgatcore.table "
                         "--log=%(outfile)s.collapse.log "
                         "--collapse=%(collapse)s")

    if transpose:
        statement.append("python -m cgatcore.table "
                         "--log=%(outfile)s.transpose.log "
                         "--transpose "
                         "--set-transpose-field=%(transpose)s")

    if shuffle:
        statement.append("python -m cgatcore.table "
                         "--log=%(outfile)s.shuffle.log "
                         "--method=randomize-rows")

    if limit > 0:
        # use awk to limit the rows in order to avoid a broken-pipe error from head
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        ignore_pipe_errors = True

    statement.append(
        build_load_statement(tablename, options=options, retry=retry))

    statement = " | ".join(statement) + " > %(outfile)s"

    to_cluster = False
    run(statement)