Example #1
    print(E.GetParams())

    if param_trans:
        parser = PredictionParser.PredictionParserBlatTrans()
    else:
        parser = PredictionParser.PredictionParserBlatCDNA()

    nmatches = 1
    for line in sys.stdin:
        if line[0] == "#": continue
        if not re.match("^[0-9]", line): continue

        try:
            entries = parser.Parse((line, ))
        except PredictionParser.AlignmentError as e:
            print("# %s" % str(e))
            print("#", line[:-1])
            sys.exit(1)

        for entry in entries:
            entry.mPredictionId = nmatches
            nmatches += 1

        print(str(entries))

    print(E.GetFooter())


if __name__ == "__main__":
    sys.exit(main(sys.argv))
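The fragment above filters comment lines from stdin, hands each remaining data line to a CGAT PredictionParser, and numbers the resulting predictions. Below is a minimal, self-contained sketch of the same read-filter-parse-number loop; StubParser is a hypothetical stand-in for the CGAT parser classes, which are not shown here.

import re
import sys


class StubParser:
    """Hypothetical stand-in for PredictionParser.PredictionParserBlatCDNA."""

    def Parse(self, lines):
        # split each tab-separated line into fields; the real parser
        # builds prediction objects instead
        return [line.rstrip("\n").split("\t") for line in lines]


def number_matches(stream, parser):
    nmatches = 1
    for line in stream:
        if line.startswith("#"):            # skip comments
            continue
        if not re.match("^[0-9]", line):    # keep numeric data lines only
            continue
        for entry in parser.Parse((line, )):
            print(nmatches, entry)
            nmatches += 1


# usage: number_matches(sys.stdin, StubParser())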
Example #2
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--pipeline-action",
                      dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format",
                      dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p",
                      "--multiprocess",
                      dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e",
                      "--exceptions",
                      dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i",
                      "--terminate",
                      dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d",
                      "--debug",
                      dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s",
                      "--set",
                      dest="variables_to_set",
                      type="string",
                      action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c",
                      "--checksums",
                      dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t",
                      "--is-test",
                      dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange",
                      dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host",
                      dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation",
                      dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(pipeline_action=None,
                        pipeline_format="svg",
                        pipeline_targets=[],
                        multiprocess=40,
                        logfile="pipeline.log",
                        dry_run=False,
                        force=False,
                        log_exceptions=False,
                        exceptions_terminate_immediately=False,
                        debug=False,
                        variables_to_set=[],
                        is_test=False,
                        ruffus_checksums_level=0,
                        rabbitmq_host="saruman",
                        rabbitmq_exchange="ruffus_pipelines",
                        input_validation=False)

    (options, args) = E.Start(parser, add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
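    # For example, "dryrun" may be False in the configuration file, but
    # passing --dry-run on the command line flips PARAMS["dryrun"] to
    # True below.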

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = \
            options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = \
            options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        # split on the first "=" only so that values may contain "="
        variable, value = variables.split("=", 1)
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot", "touch",
                                     "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile, mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'
            ))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
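                # pipeline_printout writes the list of pending tasks into
                # `stream`; the RabbitMQ filter below parses that text to
                # report task status.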
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                elif not options.without_cluster and not HAS_DRMAA:
                    E.critical(
                        "DRMAA API not found so cannot talk to a cluster.")
                    E.critical("Please use --local to run the pipeline"
                               " on this host: {}".format(os.uname()[1]))
                    sys.exit(-1)

                # make sure we are not logging at the same time in
                # different processes
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=(
                        options.exceptions_terminate_immediately),
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=True,
                             verbose=options.loglevel,
                             checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=options.ruffus_checksums_level,
                             verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # errors originating within ruffus itself, such
                        # as a missing dependency; msg then contains a
                        # RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub(r"__main__\.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # show only single-line messages on the console;
                    # the full traceback goes to the logfile
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError("pipeline failed with %i errors" %
                                 len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
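A recurring pattern in main above is layering command-line overrides on top of a configuration dictionary, as the --set key=value loop does with PARAMS. The sketch below isolates that pattern; str2val here is a hypothetical stand-in for CGAT's IOTools.str2val and simply attempts int and float conversion.

def str2val(value):
    """Hypothetical stand-in for IOTools.str2val: coerce a string to
    int or float where possible, else return it unchanged."""
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            pass
    return value


def apply_overrides(params, assignments):
    """Apply "key=value" strings to a configuration dictionary, letting
    command-line values take precedence over file-based ones."""
    for assignment in assignments:
        # split on the first "=" only so values may contain "="
        key, value = assignment.split("=", 1)
        params[key.strip()] = str2val(value.strip())
    return params


params = {"multiprocess": 40, "dryrun": False}
apply_overrides(params, ["multiprocess=10", "label=test"])
# params is now {"multiprocess": 10, "dryrun": False, "label": "test"}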
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    try:
        optlist, args = getopt.getopt(argv[1:], param_short_options,
                                      param_long_options)

    except getopt.error as msg:
        print(globals()["__doc__"], msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("--help", ):
            print(globals()["__doc__"])
            sys.exit(0)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--header-names"):
            param_headers = a.split(",")
        elif o in ("-n", "--normalize"):
            param_normalize = 1
        elif o in ("-m", "--missing-value"):
            param_missing_value = a
        elif o == "--no-titles":
            param_titles = False
        elif o == "--no-titles":
            param_titles = False
        elif o in ("-f", "--format"):
            param_format = a
        elif o == "--format-value":
            param_format_value = a
        elif o == "--bin-format":
            param_format_bin = a
        elif o in ("-s", "--method=sort --sort-order"):
            if a in ("numerical", "alphabetic"):
                param_sort = a
            else:
                param_sort = a.split(",")

    if len(args) < 1:
        print(globals()["__doc__"], "please specify at least one histogram.")
        sys.exit(1)

    param_filenames = args

    print(E.GetHeader())
    print(E.GetParams())

    histograms = []

    # the first column holds the bins
    headers = ["bin"]
    if param_headers and param_headers != "auto":
        headers = [param_headers[0]]
        del param_headers[0]

    for x in range(len(param_filenames)):

        filename = param_filenames[x]
        if not os.path.exists(filename):
            print("# skipped because file not present: %s" % filename)
            continue

        infile = IOTools.openFile(filename, "r")

        lines = [x for x in infile if x[0] != "#"]

        if len(lines) == 0:
            continue

        if param_titles:
            h = lines[0][:-1].split("\t")[1:]
            del lines[0]

        if param_headers == "auto":
            headers.append(os.path.basename(filename))
        elif param_headers:
            headers.append(param_headers[x])
        elif param_titles:
            headers += h

        data = [list(map(float, x[:-1].split("\t"))) for x in lines]

        # add empty data point for empty histograms
        if len(data) == 0:
            data = [(0, 0)]

        histograms.append(data)

    # sort the whole thing:
    if param_sort:
        sort_order = []

        if param_sort == "numerical":
            t = sorted(zip(map(int, headers[1:]),
                           range(1, len(headers) + 1)))
            for tt in t:
                sort_order.append(headers[tt[1]])

        elif param_sort == "alphabetical":
            t = sorted(zip(headers[1:], range(1, len(headers) + 1)))
            for tt in t:
                sort_order.append(headers[tt[1]])
        else:
            sort_order = param_sort

        # map header to old position
        map_header2pos = {}
        for x in range(1, len(headers)):
            map_header2pos[headers[x]] = x

        order = []
        for x in sort_order:
            if x in map_header2pos:
                order.append(map_header2pos[x])

        new_headers = [headers[0]]
        new_histograms = []

        for x in order:
            new_headers.append(headers[x])
            new_histograms.append(histograms[x - 1])

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print("\t".join(headers))

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(
        combined_histogram,
        format_bin=param_format_bin,
        format_value=param_format_value,
    )

    print(E.GetFooter())
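Histogram.Combine and Histogram.Print are CGAT helpers that are not shown here. The sketch below illustrates what the combine step presumably does, assuming each histogram is a list of (bin, value) rows and missing bins are padded with the missing-value marker; the real implementation may differ.

def combine_histograms(histograms, missing_value="na"):
    """Align histograms on their bin values, filling gaps with
    ``missing_value``. Each histogram is a list of (bin, value) rows.
    A sketch only: the real Histogram.Combine may behave differently."""
    # collect the union of all bins, in sorted order
    bins = sorted({row[0] for h in histograms for row in h})
    # index each histogram by bin for O(1) lookup
    indexed = [dict((row[0], row[1:]) for row in h) for h in histograms]
    combined = []
    for b in bins:
        row = [b]
        for h in indexed:
            row.extend(h.get(b, (missing_value,)))
        combined.append(row)
    return combined


h1 = [(0, 10), (1, 20)]
h2 = [(1, 5), (2, 15)]
print(combine_histograms([h1, h2]))
# [[0, 10, 'na'], [1, 20, 5], [2, 'na', 15]]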
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    param_long_options = [
        "verbose=", "help", "split-regex=", "after", "pattern-output=", "skip",
        "column=", "map=", "dry-run", "header", "remove-key", "append",
        "pattern-identifier=", "version", "chunk-size="
    ]

    param_short_options = "v:hr:ap:sc:dekm:"

    param_loglevel = 1
    param_split_at_regex = None
    param_after = None
    param_skip = None
    param_pattern_output = "%s.chunk"
    param_split_column = None
    param_filename_map = None
    param_dry_run = False
    param_header = False
    param_remove_key = False
    param_append = "w"
    param_pattern_identifier = None
    param_chunk_size = 1

    try:
        optlist, args = getopt.getopt(argv[1:], param_short_options,
                                      param_long_options)

    except getopt.error as msg:
        print(USAGE, msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--help"):
            print(USAGE)
            sys.exit(0)
        elif o in ("-r", "--split-regex"):
            param_split_at_regex = re.compile(a)
        elif o in ("-a", "--after"):
            param_after = 1
        elif o in ("-s", "--skip"):
            param_skip = 1
        elif o in ("-p", "--pattern-output"):
            param_pattern_output = a
        elif o in ("-c", "--column"):
            param_split_column = int(a) - 1
        elif o in ("-m", "--map"):
            param_filename_map = a
        elif o in ("-d", "--dry-run"):
            param_dry_run = True
        elif o in ("-e", "--header-names"):
            param_header = True
        elif o in ("-r", "--remove-key"):
            param_remove_key = True
        elif o == "--append":
            param_append = "a"
        elif o == "--pattern-identifier":
            param_pattern_identifier = re.compile(a)
        elif o == "--chunk-size":
            param_chunk_size = int(a)

    print(E.GetHeader())
    print(E.GetParams())

    mymap = {}
    if param_filename_map:
        infile = IOTools.openFile(param_filename_map, "r")
        for line in infile:
            if line[0] == "#":
                continue
            data = line[:-1].split("\t")[:2]
            mymap[data[0]] = data[1]

    filenames = set()
    found = set()
    ninput, noutput = 0, 0

    if param_split_column is not None:

        header = None
        files = {}
        for line in sys.stdin:

            if line[0] == "#":
                continue

            ninput += 1

            if param_header:
                if not header:
                    header = line[:-1]
                    continue
            else:
                header = None

            data = line[:-1].split("\t")

            try:
                key = data[param_split_column]
            except IndexError:
                continue

            if param_pattern_identifier:
                key = param_pattern_identifier.search(key).groups()[0]

            if mymap:
                if key in mymap:
                    key = mymap[key]
                else:
                    continue

            found.add(key)

            filename = re.sub("%s", key, param_pattern_output)
            filenames.add(filename)

            if filename not in files:

                # reset if too many files are open
                if len(files) > 1000:
                    if param_loglevel >= 1:
                        print("# resetting all files.")
                        sys.stdout.flush()

                    for f in list(files.values()):
                        f.close()
                    files = {}

                files[filename] = CreateOpen(filename, "a", param_dry_run,
                                             header)

            if param_remove_key:
                del data[param_split_column]
                files[filename].write("\t".join(data) + "\n")
            else:
                files[filename].write(line)

            noutput += 1

        for f in list(files.values()):
            f.close()

    else:
        file_id = 0

        filename = re.sub("%s", str(file_id), param_pattern_output)
        outfile = CreateOpen(filename, param_append, param_dry_run)
        nlines = 0

        header = param_header
        split = 0

        for line in sys.stdin:

            if param_split_at_regex and param_split_at_regex.search(line[:-1]):
                split += 1

            if split == param_chunk_size:
                if param_after:
                    nlines += 1
                    outfile.write(line)
                if nlines > 0:
                    outfile.close()
                    file_id += 1
                    filename = re.sub("%s", str(file_id), param_pattern_output)
                    outfile = CreateOpen(filename, param_append, param_dry_run,
                                         header)
                    filenames.add(filename)
                    split = 0

                nlines = 0
                if param_after or param_skip:
                    continue

            outfile.write(line)
            nlines += 1

        outfile.close()

    if param_loglevel >= 1:
        sys.stdout.write(
            "# ninput=%i, noutput=%i, nfound=%i, nnotfound=%i, nfiles=%i\n" %
            (ninput, noutput, len(found),
             len(set(mymap).difference(found)), len(filenames)))

    print(E.GetFooter())
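The split-by-column branch above keeps one output file per key and, once more than 1000 handles accumulate, closes them all and reopens on demand in append mode. The sketch below isolates that handle-capping pattern; split_by_column is illustrative and does not reproduce the script's CreateOpen helper, map file, or dry-run handling.

import sys


def split_by_column(stream, column=0, pattern="%s.chunk", max_open=1000):
    """Write each line of `stream` to a file named after one of its
    columns, keeping at most `max_open` files open at once. Files are
    opened in append mode so they survive being closed and reopened."""
    files = {}
    for line in stream:
        if line.startswith("#"):
            continue
        key = line.rstrip("\n").split("\t")[column]
        filename = pattern % key
        if filename not in files:
            if len(files) > max_open:
                # close everything and start over; append mode keeps
                # previously written data intact
                for f in files.values():
                    f.close()
                files = {}
            files[filename] = open(filename, "a")
        files[filename].write(line)
    for f in files.values():
        f.close()


# usage: split stdin on its first column
# split_by_column(sys.stdin, column=0)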