    print(E.GetParams())

    if param_trans:
        parser = PredictionParser.PredictionParserBlatTrans()
    else:
        parser = PredictionParser.PredictionParserBlatCDNA()

    nmatches = 1
    for line in sys.stdin:
        if line[0] == "#":
            continue
        if not re.match("^[0-9]", line):
            continue

        try:
            entries = parser.Parse((line, ))
        except PredictionParser.AlignmentError as e:
            print("# %s" % str(e))
            print("#", line[:-1])
            sys.exit(1)

        for entry in entries:
            entry.mPredictionId = nmatches
            nmatches += 1

        print(str(entries))

    print(E.GetFooter())


if __name__ == "__main__":
    sys.exit(main(sys.argv))
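
# The loop above follows a common stream-filter pattern: skip comment
# lines, parse only lines that look like PSL match records (they start
# with a digit), and number the parsed entries consecutively. A
# self-contained sketch of that pattern as a reusable generator
# (illustrative only, not part of PredictionParser's API):

def iterate_psl_lines(infile):
    """yield (match_number, line) for PSL-like records in *infile*,
    skipping comment and header lines."""
    import re
    nmatches = 1
    for line in infile:
        if line.startswith("#"):
            continue
        if not re.match("[0-9]", line):
            continue
        yield nmatches, line
        nmatches += 1
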
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommitted changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set parameter values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums "
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run "
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser,
                              add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = \
            options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = \
            options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        # IOTools.str2val coerces numeric strings to int/float; see the
        # stand-in sketch after this function.
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                elif not options.without_cluster and not HAS_DRMAA:
                    E.critical(
                        "DRMAA API not found so cannot talk to a cluster.")
                    E.critical("Please use --local to run the pipeline"
                               " on this host: {}".format(os.uname()[1]))
                    sys.exit(-1)

                #
                # make sure we are not logging at the same time in
                # different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the
                # stdout handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError("pipeline failed with %i errors" %
                                 len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))
    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
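
# A minimal sketch of a pipeline script driven by the control function
# above, expanding on the snippet in the docstring. The file name
# (pipeline_example.py) and task are hypothetical; a real task would
# typically build a command-line statement and hand it to P.run():

from ruffus import transform, suffix
import CGAT.Pipeline as P

@transform("*.input.txt", suffix(".input.txt"), ".output.txt")
def copyFile(infile, outfile):
    # trivial illustrative task: copy input to output
    with open(infile) as inf, open(outfile, "w") as outf:
        outf.write(inf.read())

if __name__ == "__main__":
    sys.exit(P.main(sys.argv))

# Invoked as ``python pipeline_example.py make copyFile``, the first
# positional argument becomes options.pipeline_action and any further
# arguments become options.pipeline_targets.

# The -s/--set overrides parsed in main() rely on IOTools.str2val to
# coerce values. A minimal stand-in for the assumed behaviour (the real
# implementation lives in CGAT.IOTools and may differ):

def _str2val_sketch(value):
    """convert *value* to int or float where possible."""
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            pass
    return value

# e.g. "-s cluster_queue=short.q -s threads=4" would yield
# PARAMS["cluster_queue"] == "short.q" and PARAMS["threads"] == 4.
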
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    try:
        optlist, args = getopt.getopt(sys.argv[1:],
                                      param_short_options,
                                      param_long_options)
    except getopt.error as msg:
        print(globals()["__doc__"], msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("--help",):
            print(globals()["__doc__"])
            sys.exit(0)
        elif o in ("--version",):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--header-names"):
            param_headers = a.split(",")
        elif o in ("-n", "--normalize"):
            param_normalize = 1
        elif o in ("-m", "--missing-value"):
            param_missing_value = a
        elif o == "--no-titles":
            param_titles = False
        elif o in ("-f", "--format"):
            param_format = a
        elif o == "--format-value":
            param_format_value = a
        elif o == "--bin-format":
            param_format_bin = a
        elif o in ("-s", "--sort-order"):
            if a in ("numerical", "alphabetic"):
                param_sort = a
            else:
                param_sort = a.split(",")

    if len(args) < 1:
        print(globals()["__doc__"],
              "please specify at least one histogram.")
        sys.exit(1)

    param_filenames = args

    print(E.GetHeader())
    print(E.GetParams())

    histograms = []

    # first column contains the bins
    headers = ["bin"]

    if param_headers and param_headers != "auto":
        headers = [param_headers[0]]
        del param_headers[0]

    for x in range(len(param_filenames)):
        filename = param_filenames[x]
        if not os.path.exists(filename):
            print("# skipped because file not present: %s" % filename)
            continue

        file = IOTools.openFile(filename, "r")
        lines = [x for x in file if x[0] != "#"]
        if len(lines) == 0:
            continue

        if param_titles:
            h = lines[0][:-1].split("\t")[1:]
            del lines[0]

        if param_headers == "auto":
            headers.append(os.path.basename(filename))
        elif param_headers:
            headers.append(param_headers[x])
        elif param_titles:
            headers += h

        data = [list(map(float, x[:-1].split("\t"))) for x in lines]

        # add empty data point for empty histograms
        if len(data) == 0:
            data = [(0, 0)]

        histograms.append(data)

    # sort the whole thing:
    if param_sort:
        sort_order = []

        if param_sort == "numerical":
            t = list(zip(list(map(int, headers[1:])),
                         list(range(1, len(headers) + 1))))
            t.sort()
            for tt in t:
                sort_order.append(headers[tt[1]])

        elif param_sort == "alphabetic":
            t = list(zip(headers[1:],
                         list(range(1, len(headers) + 1))))
            t.sort()
            for tt in t:
                sort_order.append(headers[tt[1]])

        else:
            sort_order = param_sort

        # map header to old position
        map_header2pos = {}
        for x in range(1, len(headers)):
            map_header2pos[headers[x]] = x

        order = []
        for x in sort_order:
            if x in map_header2pos:
                order.append(map_header2pos[x])

        new_headers = [headers[0]]
        new_histograms = []

        for x in order:
            new_headers.append(headers[x])
            new_histograms.append(histograms[x - 1])

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print("\t".join(headers))

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(combined_histogram,
                    format_bin=param_format_bin,
                    format_value=param_format_value)

    print(E.GetFooter())
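
# The sort above builds a permutation of the data columns and applies
# it to both the headers and the per-file histograms. A self-contained
# sketch of the same idea (illustrative helper, not part of the script):

def sort_histogram_columns(headers, histograms, numerical=True):
    """return (headers, histograms) with data columns sorted by header.

    headers[0] is the bin column and stays first; histograms[i]
    corresponds to headers[i + 1].
    """
    keyfunc = int if numerical else str
    order = sorted(range(1, len(headers)),
                   key=lambda i: keyfunc(headers[i]))
    new_headers = [headers[0]] + [headers[i] for i in order]
    new_histograms = [histograms[i - 1] for i in order]
    return new_headers, new_histograms

# e.g. sort_histogram_columns(["bin", "10", "2"], [h10, h2]) returns
# (["bin", "2", "10"], [h2, h10]).
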
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    param_long_options = [
        "verbose=", "help", "split-regex=", "after", "pattern-output=",
        "skip", "column=", "map=", "dry-run", "header", "remove-key",
        "append", "pattern-identifier=", "version", "chunk-size="
    ]
    param_short_options = "v:hr:ap:sc:dek"

    param_loglevel = 1
    param_split_at_regex = None
    param_after = None
    param_skip = None
    param_pattern_output = "%s.chunk"
    param_split_column = None
    param_filename_map = None
    param_dry_run = False
    param_header = False
    param_remove_key = False
    param_append = "w"
    param_pattern_identifier = None
    param_chunk_size = 1

    try:
        optlist, args = getopt.getopt(sys.argv[1:],
                                      param_short_options,
                                      param_long_options)
    except getopt.error as msg:
        print(USAGE, msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("-v", "--verbose"):
            param_loglevel = int(a)
        elif o in ("--version",):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--help"):
            print(USAGE)
            sys.exit(0)
        elif o in ("-r", "--split-regex"):
            param_split_at_regex = re.compile(a)
        elif o in ("-a", "--after"):
            param_after = 1
        elif o in ("-s", "--skip"):
            param_skip = 1
        elif o in ("-p", "--pattern-output"):
            param_pattern_output = a
        elif o in ("-c", "--column"):
            param_split_column = int(a) - 1
        elif o in ("-m", "--map"):
            param_filename_map = a
        elif o in ("-d", "--dry-run"):
            param_dry_run = True
        elif o in ("-e", "--header"):
            param_header = True
        elif o in ("-k", "--remove-key"):
            param_remove_key = True
        elif o == "--append":
            param_append = "a"
        elif o == "--pattern-identifier":
            param_pattern_identifier = re.compile(a)
        elif o == "--chunk-size":
            param_chunk_size = int(a)

    print(E.GetHeader())
    print(E.GetParams())

    mymap = {}
    if param_filename_map:
        infile = IOTools.openFile(param_filename_map, "r")
        for line in infile:
            if line[0] == "#":
                continue
            data = line[:-1].split("\t")[:2]
            mymap[data[0]] = data[1]

    filenames = set()
    found = set()
    ninput, noutput = 0, 0

    if param_split_column is not None:

        header = None
        files = {}

        for line in sys.stdin:

            if line[0] == "#":
                continue

            ninput += 1

            if param_header:
                if not header:
                    header = line[:-1]
                    continue
            else:
                header = None

            data = line[:-1].split("\t")

            try:
                key = data[param_split_column]
            except IndexError:
                continue

            if param_pattern_identifier:
                key = param_pattern_identifier.search(key).groups()[0]

            if mymap:
                if key in mymap:
                    key = mymap[key]
                else:
                    continue

            found.add(key)

            filename = re.sub("%s", key, param_pattern_output)
            filenames.add(filename)

            if filename not in files:
                # reset if too many files are open
                if len(files) > 1000:
                    if param_loglevel >= 1:
                        print("# resetting all files.")
                        sys.stdout.flush()
                    for f in list(files.values()):
                        f.close()
                    files = {}

                files[filename] = CreateOpen(filename, "a",
                                             param_dry_run, header)

            if param_remove_key:
                del data[param_split_column]
                files[filename].write("\t".join(data) + "\n")
            else:
                files[filename].write(line)

            noutput += 1

        for f in list(files.values()):
            f.close()

    else:

        file_id = 0
        filename = re.sub("%s", str(file_id), param_pattern_output)
        outfile = CreateOpen(filename, param_append, param_dry_run)
        nlines = 0
        header = param_header
        split = 0

        for line in sys.stdin:

            if param_split_at_regex and \
                    param_split_at_regex.search(line[:-1]):
                split += 1
                if split == param_chunk_size:
                    if param_after:
                        nlines += 1
                        outfile.write(line)
                    if nlines > 0:
                        outfile.close()
                        file_id += 1
                        filename = re.sub("%s", str(file_id),
                                          param_pattern_output)
                        outfile = CreateOpen(filename, param_append,
                                             param_dry_run, header)
                        filenames.add(filename)
                    split = 0
                    nlines = 0
                    if param_after or param_skip:
                        continue

            outfile.write(line)
            nlines += 1

        outfile.close()

    if param_loglevel >= 1:
        sys.stdout.write(
            "# ninput=%i, noutput=%i, nfound=%i, nnotfound=%i, nfiles=%i\n" %
            (ninput, noutput, len(found),
             len(set(mymap).difference(found)), len(filenames)))

    print(E.GetFooter())
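
# A self-contained sketch of the chunking logic above: a new chunk is
# started whenever the split pattern has matched chunk_size times. With
# after=True the matching line closes the current chunk; otherwise it
# opens the next one. File handling is simplified to in-memory lists
# and skip handling is omitted (illustrative only):

import re

def split_chunks(lines, pattern, chunk_size=1, after=False):
    """split *lines* into chunks at lines matching *pattern*."""
    rx = re.compile(pattern)
    chunks, current, nsplits = [], [], 0
    for line in lines:
        if rx.search(line):
            nsplits += 1
            if nsplits == chunk_size:
                if after:
                    current.append(line)
                if current:
                    chunks.append(current)
                current = [] if after else [line]
                nsplits = 0
                continue
        current.append(line)
    if current:
        chunks.append(current)
    return chunks

# e.g. split_chunks([">a", "1", ">b", "2"], "^>") returns
# [[">a", "1"], [">b", "2"]].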