def do_launch(args):
    """Resolve the configuration files on ``args`` and submit to the queue.

    For the launchpad, fworker and queue-adapter file in turn: if none was
    supplied, ``my_<kind>.yaml`` inside ``args.config_dir`` is used when it
    exists, otherwise the corresponding module-level default location.
    The loaded objects are passed to ``rapidfire`` when ``args.command`` is
    ``'rapidfire'`` and to ``launch_rocket_to_queue`` for any other command.

    Args:
        args: parsed command-line namespace; its ``*_file`` attributes and
            ``loglvl`` are updated in place.
    """
    config_sources = (
        ('launchpad_file', 'my_launchpad.yaml', LAUNCHPAD_LOC),
        ('fworker_file', 'my_fworker.yaml', FWORKER_LOC),
        ('queueadapter_file', 'my_qadapter.yaml', QUEUEADAPTER_LOC),
    )
    for attr, basename, fallback in config_sources:
        if getattr(args, attr):
            # An explicitly supplied file always wins.
            continue
        candidate = os.path.join(args.config_dir, basename)
        if os.path.exists(candidate):
            setattr(args, attr, candidate)
        else:
            setattr(args, attr, fallback)

    if args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    queueadapter = load_object_from_file(args.queueadapter_file)

    # The silencer is applied only after the LaunchPad has been built, so it
    # affects the launch calls below but not the LaunchPad constructed above.
    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.command != 'rapidfire':
        launch_rocket_to_queue(launchpad, fworker, queueadapter,
                               args.launch_dir, args.reserve, args.loglvl,
                               False, args.fill_mode)
        return

    rapidfire(launchpad,
              fworker=fworker,
              qadapter=queueadapter,
              launch_dir=args.launch_dir,
              nlaunches=args.nlaunches,
              njobs_queue=args.maxjobs_queue,
              njobs_block=args.maxjobs_block,
              sleep_time=args.sleep,
              reserve=args.reserve,
              strm_lvl=args.loglvl,
              timeout=args.timeout,
              fill_mode=args.fill_mode)
def do_launch(args):
    """Load launchpad, fworker and queue adapter, then launch to the queue.

    Each ``*_file`` attribute on ``args`` that is unset is filled in with
    the ``my_*.yaml`` file from ``args.config_dir`` when that file exists,
    and with the module-level default location otherwise.  Depending on
    ``args.command`` the work is handed to ``rapidfire`` or to
    ``launch_rocket_to_queue`` (the latter also receives ``args.fw_id``).

    Args:
        args: parsed command-line namespace; mutated in place.
    """

    def _resolved(current, basename, fallback):
        # Keep an explicit choice; otherwise prefer the config-dir file
        # and only then the packaged default.
        if current:
            return current
        candidate = os.path.join(args.config_dir, basename)
        return candidate if os.path.exists(candidate) else fallback

    args.launchpad_file = _resolved(
        args.launchpad_file, 'my_launchpad.yaml', LAUNCHPAD_LOC)
    args.fworker_file = _resolved(
        args.fworker_file, 'my_fworker.yaml', FWORKER_LOC)
    args.queueadapter_file = _resolved(
        args.queueadapter_file, 'my_qadapter.yaml', QUEUEADAPTER_LOC)

    if args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    queueadapter = load_object_from_file(args.queueadapter_file)

    # Note the ordering: the log level is silenced after the LaunchPad
    # above has already been constructed.
    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.command == 'rapidfire':
        rapidfire(
            launchpad,
            fworker=fworker,
            qadapter=queueadapter,
            launch_dir=args.launch_dir,
            nlaunches=args.nlaunches,
            njobs_queue=args.maxjobs_queue,
            njobs_block=args.maxjobs_block,
            sleep_time=args.sleep,
            reserve=args.reserve,
            strm_lvl=args.loglvl,
            timeout=args.timeout,
            fill_mode=args.fill_mode,
        )
    else:
        launch_rocket_to_queue(
            launchpad, fworker, queueadapter, args.launch_dir,
            args.reserve, args.loglvl, False, args.fill_mode, args.fw_id,
        )
def do_launch(args):
    """Pick up config files from ``args.config_dir`` and launch to the queue.

    An unset ``launchpad_file``, ``fworker_file`` or ``queueadapter_file``
    on ``args`` is replaced by the matching ``my_*.yaml`` in
    ``args.config_dir`` if that file exists; otherwise the attribute is left
    untouched.  ``rapidfire`` is called when ``args.command`` is
    ``'rapidfire'``; any other command goes to ``launch_rocket_to_queue``.

    Args:
        args: parsed command-line namespace; mutated in place.
    """
    local_defaults = (
        ('launchpad_file', 'my_launchpad.yaml'),
        ('fworker_file', 'my_fworker.yaml'),
        ('queueadapter_file', 'my_qadapter.yaml'),
    )
    for attr, basename in local_defaults:
        if getattr(args, attr):
            continue
        candidate = os.path.join(args.config_dir, basename)
        if os.path.exists(candidate):
            setattr(args, attr, candidate)

    if args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # NOTE(review): nothing above guarantees queueadapter_file is set; it is
    # passed to the loader as-is.
    queueadapter = load_object_from_file(args.queueadapter_file)

    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.command != 'rapidfire':
        launch_rocket_to_queue(launchpad, fworker, queueadapter,
                               args.launch_dir, args.reserve, args.loglvl,
                               False)
        return

    rapidfire(launchpad, fworker, queueadapter, args.launch_dir,
              args.nlaunches, args.maxjobs_queue, args.maxjobs_block,
              args.sleep, args.reserve, args.loglvl)
def qlaunch(lpad_name, fworker_name, number_nodes, walltime, number_jobs, hog):
    """
    Submit jobs to the queue that will pick up and run Fireworks.

    The queue adapter configured for ``fworker_name`` is loaded, updated with
    the requested node count, walltime and the launchpad / fireworker file
    paths under ``~/.workflow_config``, and then passed to ``rapidfire``.

    Args:
        lpad_name: name of the launchpad configuration to use.
        fworker_name: name of the fireworker configuration to use.
        number_nodes: value stored under the adapter's ``"nnodes"`` key.
        walltime: stored (as a string) under ``"walltime"``; also used to
            derive the ``--timeout`` value when ``hog`` is false.
        number_jobs: passed to ``rapidfire`` as ``nlaunches``.
        hog: if true, the rocket launch command is replaced by an infinite
            rapidfire loop; otherwise a timeout is appended to the existing
            command.

    Raises:
        FileNotFoundError: if the queue adapter configuration is missing.
    """
    from fireworks.queue.queue_launcher import rapidfire
    from vscworkflows.config import load_config

    try:
        queue_adapter = load_config("qadapter", fworker_name)
    except FileNotFoundError:
        raise FileNotFoundError(
            "Could not find the qadapter of the fireworker in "
            "$HOME/.workflow_config/fworker. Use 'vsc config fworker' to set up new "
            "fireworkers."
        )

    config_root = os.path.join(os.path.expanduser("~"), ".workflow_config")

    queue_adapter["nnodes"] = number_nodes
    queue_adapter["walltime"] = str(walltime)
    queue_adapter["launchpad_file"] = os.path.join(
        config_root, "launchpad", lpad_name + "_launchpad.yaml"
    )
    queue_adapter["fireworker_file"] = os.path.join(
        config_root, "fworker", fworker_name + "_fworker.yaml"
    )

    if not hog:
        # Append the timeout option to the configured rocket launch command.
        # NOTE(review): the factor 3000 presumably converts walltime to
        # seconds with some margin -- confirm the walltime unit.
        queue_adapter["rocket_launch"] += " --timeout " + str(walltime * 3000)
    else:
        queue_adapter["rocket_launch"] = "rapidfire --nlaunches infinite --sleep 10"

    launchpad = load_config("launchpad", lpad_name)
    fworker = load_config("fworker", fworker_name)
    log_dir = queue_adapter["logdir"]

    rapidfire(
        launchpad=launchpad,
        fworker=fworker,
        qadapter=queue_adapter,
        launch_dir=log_dir,
        nlaunches=number_jobs,
        njobs_queue=0,
        njobs_block=500,
        sleep_time=0,
        reserve=False,
        fill_mode=True,
    )
# Build the validation branch of the workflow; its fireworks all share
# val_spec, flagged as a validation evaluation.
val_spec['validation_eval'] = True
val_batch = Firework(RunRBPF_Batch(), spec=val_spec)
val_eval = Firework(RunEval(), spec=val_spec)
storeResultsFW2 = Firework(StoreResultsInDatabase(), spec=val_spec)
first_iter = Firework(Iterate(), spec=spec)

# Parent -> children links.  eval_init has two children; they must share a
# single dict entry, because a repeated key in a dict literal keeps only the
# last value (the previous literal listed eval_init twice and thereby dropped
# the eval_init -> first_iter dependency).
workflow = Workflow(
    [init_batch, eval_init, first_iter, val_batch, val_eval,
     storeResultsFW1, storeResultsFW2],
    {
        init_batch: [eval_init],
        eval_init: [first_iter, storeResultsFW1],
        val_batch: [val_eval],
        val_eval: [storeResultsFW2],
    })
launchpad.add_wf(workflow)

qadapter = CommonAdapter.from_file(
    "%sfireworks_files/my_qadapter.yaml" % RBPF_HOME_DIRECTORY)

# Submit with DEBUG-level stream logging.
fworker = FWorker()
rapidfire(launchpad, fworker, qadapter, launch_dir='.',
          nlaunches='infinite', njobs_queue=20, njobs_block=500,
          sleep_time=None, reserve=False, strm_lvl="DEBUG", timeout=None,
          fill_mode=False)
# Two evaluation fireworks: one with the old evaluation, one with the
# corrected evaluation (same sequence index, spec copied from cur_spec).
eval_old_spec['use_corrected_eval'] = False
eval_old_firework = Firework(RunEval(), spec=eval_old_spec)

eval_new_spec = copy.deepcopy(cur_spec)
eval_new_spec['seq_idx_to_eval'] = seq_idx_to_eval
eval_new_spec['use_corrected_eval'] = True
eval_new_firework = Firework(RunEval(), spec=eval_new_spec)

eval_fireworks = [eval_old_firework, eval_new_firework]
all_fireworks.extend(run_rbpf_fireworks)
all_fireworks.extend(eval_fireworks)

# Every RBPF run is a parent of both evaluation fireworks.
for fw in run_rbpf_fireworks:
    firework_dependencies[fw] = eval_fireworks

# Results are stored once both evaluations are done.
storeResultsFW = Firework(StoreResultsInDatabase(), spec=eval_new_spec)
all_fireworks.append(storeResultsFW)
for eval_fw in eval_fireworks:
    firework_dependencies[eval_fw] = storeResultsFW

# Register the workflow and submit it through the queue.
workflow = Workflow(all_fireworks, firework_dependencies)
launchpad.add_wf(workflow)

qadapter_path = "%sfireworks_files/my_qadapter.yaml" % RBPF_HOME_DIRECTORY
qadapter = CommonAdapter.from_file(qadapter_path)
rapidfire(
    launchpad,
    FWorker(),
    qadapter,
    launch_dir='.',
    nlaunches='infinite',
    njobs_queue=81,
    njobs_block=500,
    sleep_time=None,
    reserve=False,
    strm_lvl='INFO',
    timeout=None,
    fill_mode=False,
)
rapid_parser.add_argument(
    '--nlaunches', help='num_launches (int or "infinite")')
rapid_parser.add_argument(
    '--sleep', type=int, default=60, help='sleep time between loops')

args = parser.parse_args()

# Use config files from the current directory when none were given.
if not args.launchpad_file:
    if os.path.exists('my_launchpad.yaml'):
        args.launchpad_file = 'my_launchpad.yaml'

if not args.fworker_file:
    if os.path.exists('my_fworker.yaml'):
        args.fworker_file = 'my_fworker.yaml'

# Without a launchpad file no LaunchPad is created at all.
if args.launchpad_file:
    launchpad = LaunchPad.from_file(args.launchpad_file)
else:
    launchpad = None

if args.fworker_file:
    fworker = FWorker.from_file(args.fworker_file)
else:
    fworker = FWorker()

rocket_params = QueueParams.from_file(args.queue_params_file)

if args.silencer:
    args.loglvl = 'CRITICAL'

# TODO: the number of arguments here is crazy!
if args.command != 'rapidfire':
    launch_rocket_to_queue(rocket_params, args.launch_dir, args.loglvl,
                           launchpad, fworker, args.reserve)
else:
    rapidfire(rocket_params, args.launch_dir, args.njobs_queue,
              args.njobs_block, args.loglvl, args.nlaunches, args.sleep,
              launchpad, fworker, args.reserve)
def watcher_daemon(self, log_file):
    """Detach into a daemon that keeps submitting "LSF" fireworks to the queue.

    Steps, as visible in the body:

    1. Open ``log_file`` for writing (or, if that fails, use it as-is) and
       close the current database / launchpad connections.
    2. Enter ``daemon.DaemonContext`` with stdout and stderr redirected to
       the log, then rebuild the database manager, launchpad and queue
       adapter path.
    3. Exit if the ``daemons`` collection already holds a record for the
       current user; otherwise register ``self.cleanup_daemon`` with
       ``atexit`` and insert a ``{"user", "pid"}`` record.
    4. Loop: run ``queue_launcher.rapidfire``, sleep, count READY / RESERVED
       / RUNNING fireworks whose ``spec._fworker`` is ``"LSF"``, stop when
       all three counts are zero, otherwise recover offline runs.
    5. Remove the current user's daemon record(s).

    Args:
        log_file: path of the log file to open; if ``open`` fails the value
            itself is used as the log stream.  A falsy value leaves the
            daemon's stdout/stderr argument as ``None``.
    """
    log = None
    if (log_file):
        try:
            log = open(log_file, "w")
        except:
            # NOTE(review): bare except -- any failure of open() (not only a
            # wrong argument type) falls through to this branch.
            log = log_file  # hope it's a file handle instead!
        print >> sys.stderr, "Log file opened!"
    # About to fork a process, throw away all handlers.
    # Fireworks will create a new queue log handler to write to test with.
    # A little hacky, but acceptable for now.
    # NOTE(review): this rebinds the `handlers` attribute of the `logging`
    # module; it does not remove handlers attached to any logger object,
    # which may explain the FIXME below.
    logging.handlers = []
    # FIXME this seems not to have fixed it all the time?
    old_sys_stdout = sys.stdout
    # Close the existing connections before forking; they are re-created
    # inside the daemon context below.
    self.db.client.close()
    self.launchpad.connection.close()
    print >> sys.stderr, "Connections closed, preparing to fork..."
    with daemon.DaemonContext(stdout=log, stderr=log):
        logging.handlers = []
        dbm = DatabaseManager()
        self.db = dbm
        # Reconnect to mongo after fork.
        print dbm.find_lpad_config()
        self.launchpad = fireworks.LaunchPad.from_file(
            dbm.find_lpad_config())
        self.qadapter = dbm.find_qadapter()
        # Add our pid as a running process so new daemons don't get started.
        # NOTE(review): credentials are hard-coded in source -- consider
        # moving them to configuration.
        dbm.client.admin.authenticate("fireworks", "speakfriendandenter")
        db = dbm.client.daemons
        # FIXME: possible critical race for extremely rapid workflow starts --
        # the existence check below and the later insert are not atomic.
        # Add a mutex?
        running_daemons = db.daemons.find({
            "user": getpass.getuser()
        }).count()
        if running_daemons > 0:
            # TODO: check that the recorded pid is actually alive.
            print >> old_sys_stdout, "Not Forking Daemon- daemon process found"
            # Don't start a second daemon for this user.
            sys.exit(0)
        atexit.register(self.cleanup_daemon)
        db.daemons.insert_one({
            "user": getpass.getuser(),
            "pid": os.getpid()
        })
        while (True):
            # The adapter is re-loaded from self.qadapter on every pass.
            common_adapter = load_object_from_file(self.qadapter)
            # The trailing "" makes the joined path end with a separator.
            launcher_log_dir = os.path.join(FW_WFLOW_LAUNCH_LOC,
                                            getpass.getuser(), "")
            queue_launcher.rapidfire(self.launchpad,
                                     fireworks.FWorker(name="LSF"),
                                     common_adapter,
                                     reserve=True,
                                     nlaunches=0,
                                     launch_dir=launcher_log_dir,
                                     sleep_time=10,
                                     njobs_queue=500)
            failed_fws = []
            time.sleep(50)
            # offline_runs = self.launchpad.offline_runs.find({"completed": False, "deprecated": False}, {"launch_id": 1}).count()
            # self.launchpad.m_logger.info("%s offline runs found" % offline_runs)
            ready_lsf_jobs = self.launchpad.fireworks.find({
                "state": "READY",
                "spec._fworker": "LSF"
            }).count()
            reserved_lsf_jobs = self.launchpad.fireworks.find({
                "state": "RESERVED",
                "spec._fworker": "LSF"
            }).count()
            running_lsf_jobs = self.launchpad.fireworks.find({
                "state": "RUNNING",
                "spec._fworker": "LSF"
            }).count()
            self.launchpad.m_logger.info(
                "%s ready, %s running, %s reserved lsf jobs found" %
                (ready_lsf_jobs, running_lsf_jobs, reserved_lsf_jobs))
            # Nothing left to submit or wait for: leave the loop.
            if (ready_lsf_jobs == 0 and reserved_lsf_jobs == 0
                    and running_lsf_jobs == 0):
                break
            # Recover every offline run that is neither completed nor
            # deprecated; a truthy return value is collected as a failure.
            for l in self.launchpad.offline_runs.find(
                {
                    "completed": False,
                    "deprecated": False
                }, {"launch_id": 1}):
                fw = self.launchpad.recover_offline(l['launch_id'], True)
                if fw:
                    failed_fws.append(fw)
            self.launchpad.m_logger.info(
                "FINISHED recovering offline runs.")
            if failed_fws:
                self.launchpad.m_logger.info(
                    "FAILED to recover offline fw_ids: {}".format(
                        failed_fws))
        # Loop finished: drop this user's daemon record(s) so a new daemon
        # can be started later.
        db.daemons.remove({"user": getpass.getuser()})
# Options shared by all sub-commands.
parser.add_argument(
    '--logdir', default=None, help='path to a directory for logging')
parser.add_argument(
    '--loglvl', default='INFO', help='level to print log messages')
parser.add_argument(
    '--silencer', action='store_true', help='shortcut to mute log messages')
parser.add_argument(
    '-r', '--reserve', action='store_true', help='reserve a fw')
parser.add_argument(
    '-l', '--launchpad_file', default=None, help='path to launchpad file')
parser.add_argument(
    '-w', '--fworker_file', default=None, help='path to fworker file')

# Options specific to the rapidfire sub-command.
rapid_parser.add_argument(
    '-q', '--njobs_queue', type=int, default=10,
    help='maximum jobs to keep in queue for this user')
rapid_parser.add_argument(
    '-b', '--njobs_block', type=int, default=500,
    help='maximum jobs to put in a block')
rapid_parser.add_argument(
    '--nlaunches', help='num_launches (int or "infinite")')
rapid_parser.add_argument(
    '--sleep', type=int, default=60, help='sleep time between loops')

args = parser.parse_args()

# Fall back to config files in the current working directory.
if not args.launchpad_file:
    if os.path.exists('my_launchpad.yaml'):
        args.launchpad_file = 'my_launchpad.yaml'

if not args.fworker_file:
    if os.path.exists('my_fworker.yaml'):
        args.fworker_file = 'my_fworker.yaml'

if args.launchpad_file:
    launchpad = LaunchPad.from_file(args.launchpad_file)
else:
    # No launchpad file available: proceed without a LaunchPad.
    launchpad = None

if args.fworker_file:
    fworker = FWorker.from_file(args.fworker_file)
else:
    fworker = FWorker()

rocket_params = QueueParams.from_file(args.queue_params_file)

if args.silencer:
    args.loglvl = 'CRITICAL'

# TODO: the number of arguments here is crazy!
if args.command != 'rapidfire':
    launch_rocket_to_queue(rocket_params, args.launch_dir, args.loglvl,
                           launchpad, fworker, args.reserve)
else:
    rapidfire(rocket_params, args.launch_dir, args.njobs_queue,
              args.njobs_block, args.loglvl, args.nlaunches, args.sleep,
              launchpad, fworker, args.reserve)