Ejemplo n.º 1
0
def do_launch(args):
    """Resolve the config files named on ``args`` and submit to the queue.

    For the launchpad, fworker and queue-adapter files alike, a value already
    set on ``args`` is kept.  Otherwise ``my_<kind>.yaml`` inside
    ``args.config_dir`` is used when that file exists, and failing that the
    matching module-level default (``LAUNCHPAD_LOC``, ``FWORKER_LOC``,
    ``QUEUEADAPTER_LOC``).  The resolved values are written back onto
    ``args``, and ``args.loglvl`` is overwritten with ``'CRITICAL'`` when
    ``args.silencer`` is set.

    Args:
        args: parsed command-line namespace.  Reads ``launchpad_file``,
            ``fworker_file``, ``queueadapter_file``, ``config_dir``,
            ``loglvl``, ``silencer``, ``command``, ``launch_dir``,
            ``reserve`` and ``fill_mode``; the ``rapidfire`` command also
            reads ``nlaunches``, ``maxjobs_queue``, ``maxjobs_block``,
            ``sleep`` and ``timeout``.
    """
    config_defaults = (
        ('launchpad_file', 'my_launchpad.yaml', LAUNCHPAD_LOC),
        ('fworker_file', 'my_fworker.yaml', FWORKER_LOC),
        ('queueadapter_file', 'my_qadapter.yaml', QUEUEADAPTER_LOC),
    )
    for attr, filename, fallback in config_defaults:
        if getattr(args, attr):
            continue  # an explicitly supplied path always wins
        candidate = os.path.join(args.config_dir, filename)
        setattr(args, attr,
                candidate if os.path.exists(candidate) else fallback)

    # Apply --silencer before anything is built from the log level, so that a
    # default-constructed LaunchPad is muted too (previously it was created
    # with the un-silenced level).
    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    queueadapter = load_object_from_file(args.queueadapter_file)

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker=fworker, qadapter=queueadapter,
                  launch_dir=args.launch_dir, nlaunches=args.nlaunches,
                  njobs_queue=args.maxjobs_queue,
                  njobs_block=args.maxjobs_block, sleep_time=args.sleep,
                  reserve=args.reserve, strm_lvl=args.loglvl,
                  timeout=args.timeout, fill_mode=args.fill_mode)
    else:
        launch_rocket_to_queue(launchpad, fworker, queueadapter,
                               args.launch_dir, args.reserve, args.loglvl,
                               False, args.fill_mode)
Ejemplo n.º 2
0
def do_launch(args):
    """Pick the launchpad/fworker/queue-adapter files, then launch to the queue.

    Each of ``args.launchpad_file``, ``args.fworker_file`` and
    ``args.queueadapter_file`` is left alone when already set.  When unset it
    becomes ``<args.config_dir>/my_<kind>.yaml`` if that file exists, else
    the module-level ``LAUNCHPAD_LOC`` / ``FWORKER_LOC`` /
    ``QUEUEADAPTER_LOC`` default.  ``args`` is mutated in place, including
    ``args.loglvl`` (forced to ``'CRITICAL'`` when ``args.silencer`` is set).

    Args:
        args: parsed command-line namespace.  Besides the three file
            attributes it must provide ``config_dir``, ``loglvl``,
            ``silencer``, ``command``, ``launch_dir``, ``reserve`` and
            ``fill_mode``; ``nlaunches``, ``maxjobs_queue``,
            ``maxjobs_block``, ``sleep`` and ``timeout`` are read for the
            ``rapidfire`` command and ``fw_id`` for any other command.
    """
    lookups = [
        ('launchpad_file', 'my_launchpad.yaml', LAUNCHPAD_LOC),
        ('fworker_file', 'my_fworker.yaml', FWORKER_LOC),
        ('queueadapter_file', 'my_qadapter.yaml', QUEUEADAPTER_LOC),
    ]
    for attr_name, yaml_name, default_loc in lookups:
        if not getattr(args, attr_name):
            in_config_dir = os.path.join(args.config_dir, yaml_name)
            if os.path.exists(in_config_dir):
                setattr(args, attr_name, in_config_dir)
            else:
                setattr(args, attr_name, default_loc)

    # The silencer must take effect before the LaunchPad is constructed;
    # otherwise a LaunchPad built without a file keeps the original,
    # un-silenced stream level.
    if args.silencer:
        args.loglvl = 'CRITICAL'

    launchpad = (LaunchPad.from_file(args.launchpad_file)
                 if args.launchpad_file
                 else LaunchPad(strm_lvl=args.loglvl))
    fworker = (FWorker.from_file(args.fworker_file)
               if args.fworker_file
               else FWorker())
    queueadapter = load_object_from_file(args.queueadapter_file)

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker=fworker, qadapter=queueadapter,
                  launch_dir=args.launch_dir, nlaunches=args.nlaunches,
                  njobs_queue=args.maxjobs_queue,
                  njobs_block=args.maxjobs_block, sleep_time=args.sleep,
                  reserve=args.reserve, strm_lvl=args.loglvl,
                  timeout=args.timeout, fill_mode=args.fill_mode)
    else:
        launch_rocket_to_queue(launchpad, fworker, queueadapter,
                               args.launch_dir, args.reserve, args.loglvl,
                               False, args.fill_mode, args.fw_id)
Ejemplo n.º 3
0
def do_launch(args):
    """Fill in missing config files from ``args.config_dir`` and launch.

    When ``args.launchpad_file``, ``args.fworker_file`` or
    ``args.queueadapter_file`` is unset and the matching ``my_*.yaml`` exists
    inside ``args.config_dir``, that path is stored on ``args``.  There is no
    further fallback in this variant: an attribute whose file is not found
    keeps its original (falsy) value.  ``args.loglvl`` is replaced by
    ``'CRITICAL'`` when ``args.silencer`` is set.

    Args:
        args: parsed command-line namespace.  Reads ``config_dir``,
            ``loglvl``, ``silencer``, ``command``, ``launch_dir`` and
            ``reserve``; the ``rapidfire`` command also reads ``nlaunches``,
            ``maxjobs_queue``, ``maxjobs_block`` and ``sleep``.
    """
    for attr, filename in (('launchpad_file', 'my_launchpad.yaml'),
                           ('fworker_file', 'my_fworker.yaml'),
                           ('queueadapter_file', 'my_qadapter.yaml')):
        if getattr(args, attr):
            continue  # keep whatever the caller supplied
        candidate = os.path.join(args.config_dir, filename)
        if os.path.exists(candidate):
            setattr(args, attr, candidate)

    # Resolve the effective log level first so that a LaunchPad created
    # without a file honours --silencer as well.
    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # NOTE(review): queueadapter_file may still be unset here if no
    # my_qadapter.yaml was found — confirm load_object_from_file copes.
    queueadapter = load_object_from_file(args.queueadapter_file)

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker, queueadapter, args.launch_dir,
                  args.nlaunches, args.maxjobs_queue,
                  args.maxjobs_block, args.sleep, args.reserve, args.loglvl)
    else:
        launch_rocket_to_queue(launchpad, fworker, queueadapter,
                               args.launch_dir, args.reserve, args.loglvl,
                               False)
Ejemplo n.º 4
0
def qlaunch(lpad_name, fworker_name, number_nodes, walltime, number_jobs, hog):
    """
    Launch jobs to the queue that will accept Fireworks.

    The queue adapter configured for ``fworker_name`` is loaded, adjusted
    with the requested resources and the per-user launchpad / fireworker
    file locations under ``~/.workflow_config``, and then handed to
    ``rapidfire``.

    Args:
        lpad_name: name of the launchpad configuration; also used to build
            ``<lpad_name>_launchpad.yaml``.
        fworker_name: name of the fireworker configuration; also used to
            build ``<fworker_name>_fworker.yaml``.
        number_nodes: stored as the adapter's ``nnodes``.
        walltime: stored (as a string) as the adapter's ``walltime`` and,
            unless ``hog`` is set, used to derive the ``--timeout`` option.
        number_jobs: passed to ``rapidfire`` as ``nlaunches``.
        hog: when truthy, replace the rocket launch command with an
            infinite ``rapidfire`` loop instead of appending a timeout.

    Raises:
        FileNotFoundError: if the queue adapter configuration for
            ``fworker_name`` cannot be found.
    """
    from fireworks.queue.queue_launcher import rapidfire
    from vscworkflows.config import load_config

    try:
        queue_adapter = load_config("qadapter", fworker_name)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback shows the missing file as the
        # direct cause rather than as an error inside this handler.
        raise FileNotFoundError(
            "Could not find the qadapter of the fireworker in "
            "$HOME/.workflow_config/fworker. Use 'vsc config fworker' to set up new "
            "fireworkers."
        ) from err

    config_root = os.path.join(os.path.expanduser("~"), ".workflow_config")

    queue_adapter["nnodes"] = number_nodes
    queue_adapter["walltime"] = str(walltime)
    queue_adapter["launchpad_file"] = os.path.join(
        config_root, "launchpad", lpad_name + "_launchpad.yaml"
    )
    queue_adapter["fireworker_file"] = os.path.join(
        config_root, "fworker", fworker_name + "_fworker.yaml"
    )
    if hog:
        queue_adapter["rocket_launch"] = "rapidfire --nlaunches infinite --sleep 10"
    else:
        # Append a --timeout option to the adapter's existing rocket launch
        # command.  NOTE(review): presumably walltime is in hours and the
        # factor 3000 (rather than 3600) leaves a safety margin — confirm.
        queue_adapter["rocket_launch"] += " --timeout " + str(walltime * 3000)

    rapidfire(launchpad=load_config("launchpad", lpad_name),
              fworker=load_config("fworker", fworker_name), qadapter=queue_adapter,
              launch_dir=queue_adapter["logdir"], nlaunches=number_jobs,
              njobs_queue=0, njobs_block=500,
              sleep_time=0, reserve=False, fill_mode=True)
    val_spec['validation_eval'] = True   
    val_batch = Firework(RunRBPF_Batch(), spec = val_spec)    
    val_eval = Firework(RunEval(), spec = val_spec)
    storeResultsFW2 = Firework(StoreResultsInDatabase(), spec=val_spec)

    first_iter= Firework(Iterate(), spec = spec)
    workflow = Workflow([init_batch, eval_init, first_iter, val_batch, val_eval, storeResultsFW1, storeResultsFW2], 
                        {init_batch: [eval_init], eval_init: [first_iter], val_batch: [val_eval], eval_init:[storeResultsFW1], val_eval:[storeResultsFW2]})

    launchpad.add_wf(workflow)
    qadapter = CommonAdapter.from_file("%sfireworks_files/my_qadapter.yaml" % RBPF_HOME_DIRECTORY)
#    rapidfire(launchpad, FWorker(), qadapter, launch_dir='.', nlaunches='infinite', njobs_queue=20,
#                  njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None,
#                  fill_mode=False)


    fworker = FWorker()
    rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches='infinite', njobs_queue=20,
      njobs_block=500, sleep_time=None, reserve=False, strm_lvl="DEBUG", timeout=None,
      fill_mode=False)










Ejemplo n.º 6
0
                                            eval_old_spec['use_corrected_eval'] = False
                                            eval_old_firework = Firework(RunEval(), spec=eval_old_spec)

                                            eval_new_spec = copy.deepcopy(cur_spec)
                                            eval_new_spec['seq_idx_to_eval'] = seq_idx_to_eval 
                                            eval_new_spec['use_corrected_eval'] = True
                                            eval_new_firework = Firework(RunEval(), spec=eval_new_spec)

                                            eval_fireworks = [eval_old_firework, eval_new_firework]
                                            all_fireworks.extend(run_rbpf_fireworks)
                                            all_fireworks.extend(eval_fireworks)
                                            for fw in run_rbpf_fireworks:
                                                firework_dependencies[fw] = eval_fireworks

                                            storeResultsFW = Firework(StoreResultsInDatabase(), spec=eval_new_spec)
                                            all_fireworks.append(storeResultsFW)
                                            firework_dependencies[eval_old_firework] = storeResultsFW
                                            firework_dependencies[eval_new_firework] = storeResultsFW



    # store workflow and launch it
    workflow = Workflow(all_fireworks, firework_dependencies)
    launchpad.add_wf(workflow)
    qadapter = CommonAdapter.from_file("%sfireworks_files/my_qadapter.yaml" % RBPF_HOME_DIRECTORY)
    rapidfire(launchpad, FWorker(), qadapter, launch_dir='.', nlaunches='infinite', njobs_queue=81,
                  njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None,
                  fill_mode=False)


Ejemplo n.º 7
0
    rapid_parser.add_argument('--nlaunches',
                              help='num_launches (int or "infinite")')
    rapid_parser.add_argument('--sleep',
                              help='sleep time between loops',
                              default=60,
                              type=int)

    args = parser.parse_args()

    if not args.launchpad_file and os.path.exists('my_launchpad.yaml'):
        args.launchpad_file = 'my_launchpad.yaml'

    if not args.fworker_file and os.path.exists('my_fworker.yaml'):
        args.fworker_file = 'my_fworker.yaml'

    launchpad = LaunchPad.from_file(
        args.launchpad_file) if args.launchpad_file else None
    fworker = FWorker.from_file(
        args.fworker_file) if args.fworker_file else FWorker()
    rocket_params = QueueParams.from_file(args.queue_params_file)
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    # TODO: the number of arguments here is crazy!
    if args.command == 'rapidfire':
        rapidfire(rocket_params, args.launch_dir, args.njobs_queue,
                  args.njobs_block, args.loglvl, args.nlaunches, args.sleep,
                  launchpad, fworker, args.reserve)
    else:
        launch_rocket_to_queue(rocket_params, args.launch_dir, args.loglvl,
                               launchpad, fworker, args.reserve)
Ejemplo n.º 8
0
    def watcher_daemon(self, log_file):
        """Detach into a daemon that keeps submitting and recovering LSF jobs.

        Existing database connections are closed, the process enters a
        ``daemon.DaemonContext`` and reconnects, and then loops: each pass
        calls ``queue_launcher.rapidfire`` in reserve mode, sleeps, counts
        the fireworks whose ``spec._fworker`` is ``"LSF"`` in the READY,
        RESERVED and RUNNING states, and recovers incomplete offline runs.
        The loop ends once all three counts are zero, after which this
        user's record is removed from the ``daemons`` collection.

        If a daemon record for the current user already exists, the process
        exits with status 0 instead of starting the loop.

        Args:
            log_file: path of a file to open for writing and use as the
                daemon's stdout/stderr.  If opening fails, the value itself
                is used as-is (assumed to be an open file handle).  A falsy
                value leaves the daemon streams set to ``None``.
        """
        log = None

        if (log_file):
            try:
                log = open(log_file, "w")
            except:
                # NOTE(review): bare except — any failure of open() is taken
                # to mean log_file is already a file object.
                log = log_file  #hope its a filehandle instead!
        print >> sys.stderr, "Log file opened!"
        #about to fork a process, throw away all handlers.
        #fireworks will create a new queue log handler to write to test with
        #lil hacky but who cares right now
        # NOTE(review): this rebinds the ``handlers`` attribute on the
        # ``logging`` module itself; it does not clear the handler list of
        # any logger object — confirm this is the intended effect.
        logging.handlers = []
        #FIXME this seems not to have fixed it all the time?
        # Keep the pre-daemon stdout so the "not forking" message below can
        # still be written to it.
        old_sys_stdout = sys.stdout
        # Close the current connections before entering the daemon context;
        # new ones are created inside it.
        self.db.client.close()
        self.launchpad.connection.close()
        print >> sys.stderr, "Connections closed, preparing to fork..."
        with daemon.DaemonContext(stdout=log, stderr=log):
            logging.handlers = []
            dbm = DatabaseManager()
            self.db = dbm
            #reconnect to mongo after fork
            print dbm.find_lpad_config()
            self.launchpad = fireworks.LaunchPad.from_file(
                dbm.find_lpad_config())
            self.qadapter = dbm.find_qadapter()
            #add our pid as a running process so new daemons don't get started
            # NOTE(review): credentials are hard-coded in source here.
            dbm.client.admin.authenticate("fireworks", "speakfriendandenter")
            db = dbm.client.daemons
            #FIXME POSSIBLE CRITICAL RAISE FOR EXTREMELY RAPID WORKFLOW STARTS
            #ADD MUTEX?
            # The count-then-insert below is not atomic (see FIXME above).
            running_daemons = db.daemons.find({
                "user": getpass.getuser()
            }).count()
            if running_daemons > 0:
                #todo, check pid is alive
                print >> old_sys_stdout, "Not Forking Daemon- daemon process found"
                #don't start daemon
                sys.exit(0)
            # Registered before the record is inserted, so cleanup_daemon
            # runs at interpreter exit from this point on.
            atexit.register(self.cleanup_daemon)
            db.daemons.insert_one({
                "user": getpass.getuser(),
                "pid": os.getpid()
            })

            while (True):

                # The queue adapter is reloaded from file on every pass.
                common_adapter = load_object_from_file(self.qadapter)
                # Joining with "" yields a per-user directory path that ends
                # in a path separator.
                launcher_log_dir = os.path.join(FW_WFLOW_LAUNCH_LOC,
                                                getpass.getuser(), "")
                queue_launcher.rapidfire(self.launchpad,
                                         fireworks.FWorker(name="LSF"),
                                         common_adapter,
                                         reserve=True,
                                         nlaunches=0,
                                         launch_dir=launcher_log_dir,
                                         sleep_time=10,
                                         njobs_queue=500)
                failed_fws = []
                time.sleep(50)
                #                offline_runs =  self.launchpad.offline_runs.find({"completed": False, "deprecated": False}, {"launch_id": 1}).count()
                #                self.launchpad.m_logger.info("%s offline runs found" % offline_runs)
                # Count LSF fireworks in each state that still needs this
                # daemon to stay alive.
                ready_lsf_jobs = self.launchpad.fireworks.find({
                    "state":
                    "READY",
                    "spec._fworker":
                    "LSF"
                }).count()
                reserved_lsf_jobs = self.launchpad.fireworks.find({
                    "state":
                    "RESERVED",
                    "spec._fworker":
                    "LSF"
                }).count()
                running_lsf_jobs = self.launchpad.fireworks.find({
                    "state":
                    "RUNNING",
                    "spec._fworker":
                    "LSF"
                }).count()

                self.launchpad.m_logger.info(
                    "%s ready, %s running, %s reserved lsf jobs found" %
                    (ready_lsf_jobs, running_lsf_jobs, reserved_lsf_jobs))
                # Nothing left to launch or wait for: leave the loop. Note
                # that this happens before the offline-run recovery below.
                if (ready_lsf_jobs == 0 and reserved_lsf_jobs == 0
                        and running_lsf_jobs == 0):
                    break
                # Try to recover every offline run that is neither completed
                # nor deprecated; a truthy return value is collected as a
                # failed firework id.
                for l in self.launchpad.offline_runs.find(
                    {
                        "completed": False,
                        "deprecated": False
                    }, {"launch_id": 1}):
                    fw = self.launchpad.recover_offline(l['launch_id'], True)
                    if fw:
                        failed_fws.append(fw)
                self.launchpad.m_logger.info(
                    "FINISHED recovering offline runs.")
                if failed_fws:
                    self.launchpad.m_logger.info(
                        "FAILED to recover offline fw_ids: {}".format(
                            failed_fws))
            # Loop finished: drop this user's daemon record(s).
            db.daemons.remove({"user": getpass.getuser()})
Ejemplo n.º 9
0
    parser.add_argument('--logdir', help='path to a directory for logging', default=None)
    parser.add_argument('--loglvl', help='level to print log messages', default='INFO')
    parser.add_argument('--silencer', help='shortcut to mute log messages', action='store_true')
    parser.add_argument('-r', '--reserve', help='reserve a fw', action='store_true')
    parser.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=None)
    parser.add_argument('-w', '--fworker_file', help='path to fworker file', default=None)

    rapid_parser.add_argument('-q', '--njobs_queue', help='maximum jobs to keep in queue for this user', default=10, type=int)
    rapid_parser.add_argument('-b', '--njobs_block', help='maximum jobs to put in a block', default=500, type=int)
    rapid_parser.add_argument('--nlaunches', help='num_launches (int or "infinite")')
    rapid_parser.add_argument('--sleep', help='sleep time between loops', default=60, type=int)

    args = parser.parse_args()

    if not args.launchpad_file and os.path.exists('my_launchpad.yaml'):
        args.launchpad_file = 'my_launchpad.yaml'

    if not args.fworker_file and os.path.exists('my_fworker.yaml'):
        args.fworker_file = 'my_fworker.yaml'

    launchpad = LaunchPad.from_file(args.launchpad_file) if args.launchpad_file else None
    fworker = FWorker.from_file(args.fworker_file) if args.fworker_file else FWorker()
    rocket_params = QueueParams.from_file(args.queue_params_file)
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    # TODO: the number of arguments here is crazy!
    if args.command == 'rapidfire':
        rapidfire(rocket_params, args.launch_dir, args.njobs_queue, args.njobs_block, args.loglvl, args.nlaunches, args.sleep, launchpad, fworker, args.reserve)
    else:
        launch_rocket_to_queue(rocket_params, args.launch_dir, args.loglvl, launchpad, fworker, args.reserve)