def cluster_job(config, job_id, more=False): setproctitle('%s %s' % (job_id, config.name)) proxy_port = 13000 + config.instance hostname_no_instance = config.name.split(':')[0] nice = compmake_config.cluster_nice #@UndefinedVariable compmake_cmd = \ 'nice -n %d compmake --hostname=%s --db=redis --redis_events --redis_host=localhost:%s --slave %s --save_progress=False\ make_single more=%s %s' % \ (nice, hostname_no_instance, proxy_port, get_namespace(), more, job_id) redis_host = RedisInterface.host redis_port = RedisInterface.port if config.username: connection_string = '%s@%s' % (config.username, config.host) else: connection_string = config.host # TODO: make additional switches configurable args = ['ssh', connection_string, '-X', '-R', '%s:%s:%s' % (proxy_port, redis_host, redis_port), '%s' % compmake_cmd] if compmake_config.cluster_show_cmd: #@UndefinedVariable print " ".join(args) PIPE = subprocess.PIPE p = subprocess.Popen(args, stdout=PIPE, stdin=PIPE, stderr=PIPE) ret = p.wait() if ret == RET_CODE_JOB_FAILED: raise JobFailed('Job %s failed' % job_id) if ret != 0: raise HostFailed('Job %s: host failed: (line: "%s", ret=%s)' % (job_id, " ".join(args), ret)) return ret
def make(job_id, more=False):
    """ Makes a single job. Returns the user-object or raises JobFailed.

        High-level flow:
        1. If the job is already up to date (and no 'more' iterations
           were requested), return the stored user-object immediately.
        2. Otherwise, pick a starting point (previous_user_object) based
           on the cached state, run computation.compute(), tracking
           progress if it is a generator, and persist the result.

        Raises JobInterrupted on Ctrl-C and JobFailed on any other
        exception raised by the computation.
    """
    host = compmake_config.hostname #@UndefinedVariable
    setproctitle(job_id)

    # TODO: should we make sure we are up to date???
    up, reason = up_to_date(job_id) #@UnusedVariable
    cache = get_job_cache(job_id)
    want_more = cache.state == Cache.MORE_REQUESTED
    if up and not (more and want_more):
        # Fast path: nothing to do, the stored result is current.
        # print "%s is up to date" % job_id
        assert is_job_userobject_available(job_id)
        return get_job_userobject(job_id)
    else:
        # if up and (more and want_more): # XXX review the logic
        #    reason = 'want more'
        # print "Making %s (%s)" % (job_id, reason)
        computation = get_job(job_id)

        assert(cache.state in [Cache.NOT_STARTED, Cache.IN_PROGRESS,
                               Cache.MORE_REQUESTED, Cache.DONE,
                               Cache.FAILED])

        # Decide what previous partial result (if any) to resume from.
        # NOTE(review): this first branch is an 'if', not an 'elif' of
        # the chain below; after it sets IN_PROGRESS, the
        # 'elif cache.state == Cache.IN_PROGRESS' branch below also
        # matches and may overwrite previous_user_object with a stale
        # tmpobject. Looks unintended — confirm whether this should be
        # part of the elif chain.
        if cache.state == Cache.NOT_STARTED:
            previous_user_object = None
            cache.state = Cache.IN_PROGRESS
        if cache.state == Cache.FAILED:
            previous_user_object = None
            cache.state = Cache.IN_PROGRESS
        elif cache.state == Cache.IN_PROGRESS:
            # A previous run was interrupted; resume from the temporary
            # object if one was saved.
            if is_job_tmpobject_available(job_id):
                previous_user_object = get_job_tmpobject(job_id)
            else:
                previous_user_object = None
        elif cache.state == Cache.MORE_REQUESTED:
            assert(is_job_userobject_available(job_id))
            if is_job_tmpobject_available(job_id):
                # resuming more computation
                previous_user_object = get_job_tmpobject(job_id)
            else:
                # starting more computation
                previous_user_object = get_job_userobject(job_id)
        elif cache.state == Cache.DONE:
            # If we are done, it means children have been updated
            assert(not up)
            previous_user_object = None
        else:
            assert(False)

        # update state
        cache.time_start = time()
        cpu_start = clock()
        set_job_cache(job_id, cache)

        # Relay fine-grained progress events published by the
        # computation itself.
        def progress_callback(stack):
            publish('job-progress-plus', job_id=job_id, host=host,
                    stack=stack)
        init_progress_tracking(progress_callback)

        num, total = 0, None
        user_object = None

        # Capture the job's stdout/stderr so it can be stored in the
        # cache (echoing to the console according to configuration).
        capture = OutputCapture(prefix=job_id,
            echo_stdout=compmake_config.echo_stdout, #@UndefinedVariable
            echo_stderr=compmake_config.echo_stderr) #@UndefinedVariable
        try:
            result = computation.compute(previous_user_object)

            if type(result) == GeneratorType:
                # Iterative computation: each yielded 3-tuple is
                # (partial user_object, iterations done, iterations goal).
                # NOTE(review): 'next' shadows the builtin here.
                try:
                    while True:
                        next = result.next()
                        if isinstance(next, tuple):
                            if len(next) != 3:
                                raise CompmakeException(
                                    'If computation yields a tuple, ' +
                                    'should be a tuple with 3 elemnts.' +
                                    'Got: %s' % str(next))
                            user_object, num, total = next

                            publish('job-progress', job_id=job_id,
                                    host=host, done=None,
                                    progress=num, goal=total)
                            if compmake_config.save_progress: #@UndefinedVariable
                                set_job_tmpobject(job_id, user_object)
                except StopIteration:
                    pass
            else:
                publish('job-progress', job_id=job_id, host='XXX',
                        done=1, progress=1, goal=1)
                user_object = result

        except KeyboardInterrupt:
            # TODO: clear progress cache
            # Save the current progress:
            cache.iterations_in_progress = num
            cache.iterations_goal = total
            if user_object:
                set_job_tmpobject(job_id, user_object)
            set_job_cache(job_id, cache)

            # clear progress cache
            publish('job-interrupted', job_id=job_id, host=host)
            raise JobInterrupted('Keyboard interrupt')
        except Exception as e:
            # Record the traceback with the failure so it can be shown
            # later, then re-raise as JobFailed.
            sio = StringIO()
            print_exc(file=sio)
            bt = sio.getvalue()

            error("Job %s failed: %s" % (job_id, e))
            error(bt)

            mark_as_failed(job_id, e, bt)

            # clear progress cache
            publish('job-failed', job_id=job_id, host=host, reason=e)
            raise JobFailed('Job %s failed: %s' % (job_id, e))
        finally:
            capture.deactivate()
            # even if we send an error, let's save the output of the process
            cache = get_job_cache(job_id)
            cache.captured_stderr = \
                capture.stderr_replacement.buffer.getvalue()
            cache.captured_stdout = \
                capture.stdout_replacement.buffer.getvalue()
            set_job_cache(job_id, cache)

        set_job_userobject(job_id, user_object)
        if is_job_tmpobject_available(job_id):
            # We only have one with yield
            delete_job_tmpobject(job_id)

        cache.state = Cache.DONE
        cache.timestamp = time()
        walltime = cache.timestamp - cache.time_start
        cputime = clock() - cpu_start
        # FIXME walltime/cputime not precise (especially for "more" computation)
        cache.walltime_used = walltime
        cache.cputime_used = cputime
        cache.done_iterations = num # XXX not true
        cache.host = compmake_config.hostname #@UndefinedVariable

        set_job_cache(job_id, cache)

        publish('job-succeeded', job_id=job_id, host=host)

        # TODO: clear these records in other place
        return user_object
def main():
    """ Compmake command-line entry point.

        Parses options, initializes the storage backend, optionally
        relays events over Redis, then either:
        - interactive mode (default): imports the user's module (first
          positional argument) so it can define jobs, or
        - slave mode (--slave): only sets the namespace from the first
          positional argument.
        Remaining positional arguments are interpreted as compmake
        commands; with none, an interactive console is started.
        Always terminates via sys.exit().
    """
    setproctitle('compmake')

    parser = OptionParser(version=version)
    parser.add_option("--slave", action="store_true",
                      default=False, dest="slave",
                      help="[internal] Runs compmake in slave mode.")
    parser.add_option("--redis_events", action="store_true",
                      default=False, dest="redis_events",
                      help="[internal] Relays events using Redis.")
    config_populate_optparser(parser)

    (options, args) = parser.parse_args()

    initialize_backend()

    # We load plugins after we parsed the configuration
    from compmake import plugins #@UnusedImport

    if options.redis_events:
        # Event relaying requires the Redis backend.
        if not compmake_config.db == 'redis': #@UndefinedVariable
            error('Cannot use redis_events without redis.')
            sys.exit(-2)

        from compmake.storage.redisdb import RedisInterface

        # register an handler that will capture all events
        def handler(event):
            RedisInterface.events_push(event)

        remove_all_handlers()
        register_handler("*", handler)

    if not options.slave:
        # XXX make sure this is the default
        set_compmake_status(compmake_status_interactive)

        # TODO: add command namespace
        # TODO: add command "load"
        if not args:
            user_error('I expect at least one parameter (module name)')
            sys.exit(-2)

        module_name = args[0]
        args = args[1:]

        if module_name.endswith('.py') or (module_name.find('/') > 0):
            # Convenience: the user passed a path; turn it into a module
            # name.
            # NOTE(review): replace('.py', '') removes every occurrence
            # of ".py", not just a trailing suffix — confirm acceptable
            # for the expected inputs.
            warning('You passed a string "%s" which looks like a filename.' %
                    module_name)
            module_name = module_name.replace('/', '.')
            module_name = module_name.replace('.py', '')
            warning('However, I need a module name. I will try with "%s".' %
                    module_name)

        set_namespace(module_name)
        compmake.is_it_time = True
        try:
            # Importing the module runs its top level, which defines jobs.
            __import__(module_name)
        except Exception as e:
            error('Error while trying to import module "%s": %s' %
                  (module_name, e))
            traceback.print_exc(file=sys.stderr)
            sys.exit(-5)

        # TODO: BUG: XXX: remove old jobs those in defined_this_section
    else:
        set_compmake_status(compmake_status_slave)

        if not args:
            user_error('I expect at least one parameter (namespace name)')
            sys.exit(-2)

        module_name = args.pop(0)
        set_namespace(module_name)

    if args:
        try:
            # XXX is this redudant?
            # compmake_config.interactive = False
            retcode = interpret_commands(args)
            # print "Exiting with retcode %s" % retcode
            sys.exit(retcode)
        except UserError as e:
            user_error(e)
            sys.exit(-6)
    else:
        retcode = interactive_console()
        sys.exit(retcode)