Example #1
0
    def test_get_job_stats_logs_errors(self):
        """Smoke-test the haven job pipeline end to end.

        Submits a single experiment to the cluster through
        ``hjb.run_exp_list_jobs``, verifies that the per-job metadata file
        is written, checks that ``JobManager`` and ``ResultManager`` agree
        on the job summary table, then kills the job and asserts it is
        reported as cancelled.
        """
        # Minimal single-experiment configuration used as the job payload.
        exp_list = [{
            'model': {
                'name': 'mlp',
                'n_layers': 30
            },
            'dataset': 'mnist',
            'batch_size': 1
        }]
        # NOTE(review): hard-coded shared-storage path — this test can only
        # run on infrastructure where this mount exists.
        savedir_base = '/mnt/datasets/public/issam/tmp'
        job_config = {
            'volume': ['/mnt:/mnt'],
            'image': 'images.borgy.elementai.net/issam.laradji/main',
            'bid': '1',
            'restartable': '1',
            'gpu': '1',
            'mem': '20',
            'cpu': '2',
        }
        # <exp_id> is a placeholder presumably substituted per experiment by
        # the launcher — verify against haven_jobs.
        run_command = ('python example.py -ei <exp_id> -sb %s' %
                       (savedir_base))

        hjb.run_exp_list_jobs(exp_list,
                              savedir_base=savedir_base,
                              workdir=os.path.dirname(
                                  os.path.realpath(__file__)),
                              run_command=run_command,
                              job_config=job_config,
                              force_run=True,
                              wait_seconds=0)
        # The launcher should persist job metadata under the experiment hash.
        assert (os.path.exists(
            os.path.join(savedir_base, hu.hash_dict(exp_list[0]),
                         'borgy_dict.json')))
        # JobManager and ResultManager must report the same summary table.
        jm = hjb.JobManager(exp_list=exp_list, savedir_base=savedir_base)
        jm_summary_list = jm.get_summary()
        rm = hr.ResultManager(exp_list=exp_list, savedir_base=savedir_base)
        rm_summary_list = rm.get_job_summary()
        assert (rm_summary_list['table'].equals(jm_summary_list['table']))

        # Killing the job must be reflected in its reported status.
        jm.kill_jobs()
        assert ('CANCELLED' in jm.get_summary()['status'][0])
Example #2
0
        # NOTE(review): fragment — the matching `if` branch (single-experiment
        # selection) lies above this excerpt.
        exp_list = [exp_dict]

    else:
        # select exp group: concatenate every experiment dict registered
        # under each requested group name
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    # Run experiments
    # ---------------
    if not args.run_jobs:
        # run experiments sequentially on this machine
        for exp_dict in exp_list:
            # do trainval
            trainval(exp_dict=exp_dict,
                     savedir_base=args.savedir_base,
                     reset=args.reset)

    else:
        # local module holding the cluster resource configuration
        import job_config
        # run experiments in parallel on the cluster; <exp_id> is a
        # placeholder presumably filled in per experiment by the launcher —
        # verify against haven_jobs
        run_command = ('python trainval.py -ei <exp_id> -sb %s' %
                       (args.savedir_base))
        hjb.run_exp_list_jobs(exp_list,
                              savedir_base=args.savedir_base,
                              workdir=os.path.dirname(
                                  os.path.realpath(__file__)),
                              run_command=run_command,
                              job_config=job_config.job_config)
Example #3
0
    # -------------------
    if args.exp_id is not None:
        # select one experiment: reload its saved config from disk so the
        # run uses exactly the hyperparameters that were launched
        savedir = os.path.join(args.savedir_base, args.exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, 'exp_dict.json'))

        exp_list = [exp_dict]

    else:
        # select exp group: concatenate every experiment dict registered
        # under each requested group name
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    # Run experiments or View them
    # ----------------------------
    if args.run_jobs:
        # launch jobs on the cluster (imported lazily so local runs do not
        # need the job backend)
        from haven import haven_jobs as hj
        hj.run_exp_list_jobs(exp_list,
                             savedir_base=args.savedir_base,
                             workdir=os.path.dirname(
                                 os.path.realpath(__file__)))

    else:
        # run experiments sequentially on this machine
        for exp_dict in exp_list:
            # do trainval
            trainval(exp_dict=exp_dict,
                     savedir_base=args.savedir_base,
                     reset=args.reset)
Example #4
0
        exp_list = [exp_dict]

    # NOTE(review): fragment — the matching `if` branch (single-experiment
    # selection) lies above this excerpt.
    else:
        # select exp group: concatenate every experiment dict registered
        # under each requested group name
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += EXP_GROUPS[exp_group_name]

    # Run experiments or View them
    # ----------------------------
    if args.run_jobs:
        # launch jobs on the cluster (imports kept local to this branch)
        from haven import haven_jobs as hjb
        from .job_config import job_config
        # <exp_id> is a placeholder presumably filled in per experiment by
        # the launcher — verify against haven_jobs
        run_command = ('python trainval.py -ei <exp_id> -sb %s -nw %d -d %s' %
                       (args.savedir_base, args.num_workers, args.data_root))
        workdir = os.path.dirname(os.path.realpath(__file__))
        hjb.run_exp_list_jobs(exp_list,
                              savedir_base=args.savedir_base,
                              workdir=workdir,
                              run_command=run_command,
                              job_config=job_config)

    else:
        # run experiments sequentially on this machine
        for exp_dict in exp_list:
            # do trainval
            trainval(exp_dict=exp_dict,
                     savedir_base=args.savedir_base,
                     data_root=args.data_root,
                     reset=args.reset)
Example #5
0
            % (args.savedir_base, args.data_root, args.wandb, args.wandb_key))
        # Cluster resource request; placeholder volume/image paths must be
        # filled in before launching.
        job_config = {
            'volume': [
                "PATH/TO/VOLUME",
            ],
            'image': 'PATH/TO/IMAGE',
            'gpu': '1',
            'mem': '32',
            'bid': '1',
            'restartable': '1',
            'cpu': '4',
        }
        # launch one cluster job per experiment from this script's directory
        workdir = os.path.dirname(os.path.realpath(__file__))

        hjb.run_exp_list_jobs(exp_list,
                              savedir_base=args.savedir_base,
                              workdir=workdir,
                              run_command=run_command,
                              job_config=job_config,
                              username=args.wandb_username)  # change username

    else:
        # run experiments sequentially on this machine
        for exp_dict in exp_list:
            # do trainval, forwarding the wandb logging settings
            trainval(exp_dict=exp_dict,
                     savedir_base=args.savedir_base,
                     data_root=args.data_root,
                     reset=args.reset,
                     wandb=args.wandb,
                     wandb_key=args.wandb_key)
Example #6
0
        # NOTE(review): fragment — the matching `if` branch (single-experiment
        # selection) lies above this excerpt.
        exp_list = [exp_dict]

    else:
        # Select exp group: concatenate every experiment dict registered
        # under each requested group name
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    # Launch jobs on compute cluster
    # NOTE(review): dead branch — `if False` permanently disables the
    # cluster launch path; presumably a debugging leftover — confirm intent.
    if False:
        from haven import haven_jobs as hj
        run_command = ('python train.py -ei <exp_id> '
                       '-fid %d -sb %s -u %s -t %s' %
                       (args.compute_fid, args.savedir_base, args.username,
                        args.use_tensorboard))
        hj.run_exp_list_jobs(
            exp_list,
            savedir_base=args.savedir_base,
            workdir=os.path.dirname(os.path.realpath(__file__)),
            run_command=run_command,
            job_utils_path=exp_configs.JOB_UTILS_PATH,
            job_config=exp_configs.BORGY_CONFIGS[args.username])
    # Launch jobs locally
    else:
        # Run experiments sequentially on this machine
        for exp_dict in exp_list:
            train(exp_dict=exp_dict,
                  savedir_base=args.savedir_base,
                  reset=args.reset)