def test_get_job_stats_logs_errors(self):
    """Smoke-test job submission, summary retrieval and cancellation.

    Submits one experiment as a cluster job, checks the job metadata file
    appears under the experiment hash, verifies JobManager and ResultManager
    report the same summary table, then kills the job and checks the status.
    """
    exp_list = [{'model': {'name': 'mlp', 'n_layers': 30},
                 'dataset': 'mnist',
                 'batch_size': 1}]
    savedir_base = '/mnt/datasets/public/issam/tmp'
    job_config = {
        'volume': ['/mnt:/mnt'],
        'image': 'images.borgy.elementai.net/issam.laradji/main',
        'bid': '1',
        'restartable': '1',
        'gpu': '1',
        'mem': '20',
        'cpu': '2',
    }
    run_command = ('python example.py -ei <exp_id> -sb %s' % (savedir_base))

    # Submit the experiment list without waiting between submissions.
    hjb.run_exp_list_jobs(exp_list,
                          savedir_base=savedir_base,
                          workdir=os.path.dirname(os.path.realpath(__file__)),
                          run_command=run_command,
                          job_config=job_config,
                          force_run=True,
                          wait_seconds=0)

    # Submission must have written a borgy_dict.json under the exp hash dir.
    assert os.path.exists(os.path.join(savedir_base,
                                       hu.hash_dict(exp_list[0]),
                                       'borgy_dict.json'))

    # JobManager and ResultManager must agree on the job summary table.
    jm = hjb.JobManager(exp_list=exp_list, savedir_base=savedir_base)
    jm_summary_list = jm.get_summary()
    rm = hr.ResultManager(exp_list=exp_list, savedir_base=savedir_base)
    rm_summary_list = rm.get_job_summary()
    assert rm_summary_list['table'].equals(jm_summary_list['table'])

    # After killing the jobs, the reported status must reflect cancellation.
    jm.kill_jobs()
    assert 'CANCELLED' in jm.get_summary()['status'][0]
exp_list = [exp_dict] else: # select exp group exp_list = [] for exp_group_name in args.exp_group_list: exp_list += exp_configs.EXP_GROUPS[exp_group_name] # Run experiments # --------------- if not args.run_jobs: # run experiments sequentially for exp_dict in exp_list: # do trainval trainval(exp_dict=exp_dict, savedir_base=args.savedir_base, reset=args.reset) else: import job_config # run experiments in parallel run_command = ('python trainval.py -ei <exp_id> -sb %s' % (args.savedir_base)) hjb.run_exp_list_jobs(exp_list, savedir_base=args.savedir_base, workdir=os.path.dirname( os.path.realpath(__file__)), run_command=run_command, job_config=job_config.job_config)
# -------------------
if args.exp_id is not None:
    # a single experiment was selected on the command line; reload its config
    savedir = os.path.join(args.savedir_base, args.exp_id)
    exp_dict = hu.load_json(os.path.join(savedir, 'exp_dict.json'))
    exp_list = [exp_dict]
else:
    # otherwise expand every requested experiment group
    exp_list = []
    for group_name in args.exp_group_list:
        exp_list += exp_configs.EXP_GROUPS[group_name]

# Run experiments or View them
# ----------------------------
if args.run_jobs:
    # launch jobs on the cluster
    from haven import haven_jobs as hj
    hj.run_exp_list_jobs(exp_list,
                         savedir_base=args.savedir_base,
                         workdir=os.path.dirname(os.path.realpath(__file__)))
else:
    # run experiments locally, one after another
    for exp_dict in exp_list:
        # do trainval
        trainval(exp_dict=exp_dict,
                 savedir_base=args.savedir_base,
                 reset=args.reset)
exp_list = [exp_dict] else: # select exp group exp_list = [] for exp_group_name in args.exp_group_list: exp_list += EXP_GROUPS[exp_group_name] # Run experiments or View them # ---------------------------- if args.run_jobs: from haven import haven_jobs as hjb from .job_config import job_config run_command = ('python trainval.py -ei <exp_id> -sb %s -nw %d -d %s' % (args.savedir_base, args.num_workers, args.data_root)) workdir = os.path.dirname(os.path.realpath(__file__)) hjb.run_exp_list_jobs(exp_list, savedir_base=args.savedir_base, workdir=workdir, run_command=run_command, job_config=job_config) else: # run experiments for exp_dict in exp_list: # do trainval trainval(exp_dict=exp_dict, savedir_base=args.savedir_base, data_root=args.data_root, reset=args.reset)
% (args.savedir_base, args.data_root, args.wandb, args.wandb_key)) job_config = { 'volume': [ "PATH/TO/VOLUME", ], 'image': 'PATH/TO/IMAGE', 'gpu': '1', 'mem': '32', 'bid': '1', 'restartable': '1', 'cpu': '4', } workdir = os.path.dirname(os.path.realpath(__file__)) hjb.run_exp_list_jobs(exp_list, savedir_base=args.savedir_base, workdir=workdir, run_command=run_command, job_config=job_config, username=args.wandb_username) # change username else: # run experiments for exp_dict in exp_list: # do trainval trainval(exp_dict=exp_dict, savedir_base=args.savedir_base, data_root=args.data_root, reset=args.reset, wandb=args.wandb, wandb_key=args.wandb_key)
exp_list = [exp_dict] else: # Select exp group exp_list = [] for exp_group_name in args.exp_group_list: exp_list += exp_configs.EXP_GROUPS[exp_group_name] # Launch jobs on compute cluster if False: from haven import haven_jobs as hj run_command = ('python train.py -ei <exp_id> ' '-fid %d -sb %s -u %s -t %s' % (args.compute_fid, args.savedir_base, args.username, args.use_tensorboard)) hj.run_exp_list_jobs( exp_list, savedir_base=args.savedir_base, workdir=os.path.dirname(os.path.realpath(__file__)), run_command=run_command, job_utils_path=exp_configs.JOB_UTILS_PATH, job_config=exp_configs.BORGY_CONFIGS[args.username]) # Launch jobs locally else: # Run experiments for exp_dict in exp_list: train(exp_dict=exp_dict, savedir_base=args.savedir_base, reset=args.reset)