Beispiel #1
0
    def test_get_job_stats_logs_errors(self):
        """Launch one experiment as a job, compare summaries, then cancel it.

        Steps exercised:
        1. submit the experiment list through ``hjb.run_exp_list_jobs``;
        2. check that ``borgy_dict.json`` exists in the experiment folder;
        3. check that ``JobManager`` and ``ResultManager`` report equal
           ``'table'`` entries;
        4. kill the jobs and check the first status contains ``CANCELLED``.
        """
        base_dir = '/mnt/datasets/public/issam/tmp'
        experiments = [
            {
                'model': {'name': 'mlp', 'n_layers': 30},
                'dataset': 'mnist',
                'batch_size': 1,
            },
        ]
        resources = {
            'volume': ['/mnt:/mnt'],
            'image': 'images.borgy.elementai.net/issam.laradji/main',
            'bid': '1',
            'restartable': '1',
            'gpu': '1',
            'mem': '20',
            'cpu': '2',
        }
        launch_command = 'python example.py -ei <exp_id> -sb %s' % base_dir
        this_dir = os.path.dirname(os.path.realpath(__file__))

        # Submit the jobs.
        hjb.run_exp_list_jobs(
            experiments,
            savedir_base=base_dir,
            workdir=this_dir,
            run_command=launch_command,
            job_config=resources,
            force_run=True,
            wait_seconds=0,
        )

        # The launcher is expected to leave a job record in the experiment
        # folder, which is named after the hash of the experiment dict.
        job_record = os.path.join(base_dir, hu.hash_dict(experiments[0]),
                                  'borgy_dict.json')
        assert os.path.exists(job_record)

        # Both managers must agree on the summary table.
        job_manager = hjb.JobManager(exp_list=experiments,
                                     savedir_base=base_dir)
        manager_summary = job_manager.get_summary()
        result_manager = hr.ResultManager(exp_list=experiments,
                                          savedir_base=base_dir)
        result_summary = result_manager.get_job_summary()
        assert result_summary['table'].equals(manager_summary['table'])

        # Cancelling must be reflected in a freshly fetched summary.
        job_manager.kill_jobs()
        first_status = job_manager.get_summary()['status'][0]
        assert 'CANCELLED' in first_status
Beispiel #2
0
def run_wizard(func,
               exp_list=None,
               exp_groups=None,
               job_config=None,
               savedir_base=None,
               reset=None,
               args=None,
               use_threads=True,
               exp_id=None):
    """Run ``func`` over a set of experiments, locally or as launched jobs.

    Args:
        func: Callable invoked as ``func(exp_dict=..., savedir=..., args=...)``
            once per experiment when running locally.
        exp_list: Experiments to run. When ``None`` (and no ``exp_id`` is
            given) the list is built from ``exp_groups``.
        exp_groups: Indexed by each name in ``args.exp_group_list``; every
            entry is appended to the experiment list.
        job_config: Required when ``args.run_jobs`` is truthy; must contain
            the key ``'account_id'``.
        savedir_base: Results root; falls back to ``args.savedir_base``.
        reset: Forwarded to ``create_experiment``; falls back to
            ``args.reset``.
        args: Parsed options; obtained from ``get_args()`` when ``None``.
        use_threads: Forwarded to ``launch_menu`` as ``in_parallel``.
        exp_id: When set, only the experiment stored under
            ``<savedir_base>/<exp_id>/exp_dict.json`` is run.
    """
    if args is None:
        args = get_args()

    # Explicit keyword arguments win over the parsed options whenever they
    # are truthy.
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    if exp_id is None:
        # No single experiment requested: gather the groups if needed.
        if exp_list is None:
            exp_list = []
            for group_name in args.exp_group_list:
                exp_list.extend(exp_groups[group_name])

        # Optionally create the results notebook.
        notebook_name = args.visualize_notebook
        if len(notebook_name) > 0:
            if '.ipynb' not in notebook_name:
                notebook_name = notebook_name + '.ipynb'
            create_jupyter_file(fname=notebook_name,
                                savedir_base=savedir_base)
    else:
        # A single experiment was requested: reload its saved definition.
        exp_folder = os.path.join(savedir_base, exp_id)
        exp_list = [hu.load_json(os.path.join(exp_folder, "exp_dict.json"))]

    hu.check_duplicates(exp_list)
    print('\nRunning %d experiments' % len(exp_list))

    if not args.run_jobs:
        # Local execution: run every experiment in this process.
        for experiment in exp_list:
            experiment_dir = create_experiment(experiment,
                                               savedir_base,
                                               reset=reset,
                                               verbose=True)
            func(exp_dict=experiment, savedir=experiment_dir, args=args)
        return

    # Job execution: hand the experiments over to the job manager.
    from haven import haven_jobs as hjb
    assert job_config is not None
    assert 'account_id' in job_config
    manager = hjb.JobManager(
        exp_list=exp_list,
        savedir_base=savedir_base,
        workdir=os.getcwd(),
        job_config=job_config,
    )

    launch_command = ('python trainval.py -ei <exp_id> -sb %s -d %s' %
                      (savedir_base, args.datadir))
    print(launch_command)
    manager.launch_menu(command=launch_command, in_parallel=use_threads)
Beispiel #3
0
    }

    exp_list = [{
        'model': {
            'name': 'mlp',
            'n_layers': 20
        },
        'dataset': 'mnist',
        'batch_size': 1
    }]
    savedir_base = '/mnt/results/test'

    jm = hjb.JobManager(
        exp_list=exp_list,
        savedir_base=savedir_base,
        workdir=os.path.dirname(os.path.realpath(__file__)),
        job_config=job_config,
        account_id='75ce4cee-6829-4274-80e1-77e89559ddfb',
    )
    # get jobs
    job_list_old = jm.get_jobs()

    # run single command
    savedir_logs = '%s/%s' % (savedir_base, np.random.randint(1000))
    os.makedirs(savedir_logs, exist_ok=True)
    command = 'echo 2'
    job_id = jm.submit_job(command,
                           workdir=jm.workdir,
                           savedir_logs=savedir_logs)

    # get jobs
Beispiel #4
0
                     savedir_base=args.savedir_base,
                     datadir_base=args.datadir_base,
                     reset=args.reset,
                     num_workers=args.num_workers,
                     pin_memory=args.pin_memory,
                     ngpu=args.ngpu,
                     cuda_deterministic=args.cuda_deterministic,
                     )
    else:
        # launch jobs
        from haven import haven_jobs as hjb
        import job_configs as jc
        
        jm = hjb.JobManager(exp_list=exp_list, 
                    savedir_base=args.savedir_base, 
                    account_id=jc.ACCOUNT_ID,
                    workdir=os.path.dirname(os.path.realpath(__file__)),
                    job_config=jc.JOB_CONFIG,
                    )

        command = ("python trainval.py "
                       "-ei <exp_id> "
                       "-sb {savedir_base} "
                       "-d {datadir_base} "
                       "-ng {ngpu} "
                       "-cd {cuda_deterministic} "
                       "-pm {pin_memory} "
                       "-nw {num_workers}".format(savedir_base=args.savedir_base,
                                                            ngpu=args.ngpu,
                                                            cuda_deterministic=args.cuda_deterministic,
                                                            pin_memory=args.pin_memory,
                                                            num_workers=args.num_workers,
Beispiel #5
0
    #             'dataset':'mnist', 'batch_size':1}]
    # savedir_base = '/home/toolkit/home_mnt/data/experiments'
    # job_config = {
    #     'image': 'registry.console.elementai.com/mila.mattie_sandbox.fewshotgan/fewshot-gan',
    #     'data': ['mila.mattie_sandbox.fewshotgan.home:/home/toolkit/home_mnt'],

if __name__ == '__main__':
    # Manual check of job submission: build a JobManager for a single
    # experiment, submit one shell command, and verify that the submitted job
    # is the first entry of the refreshed job list.
    exp_list = [{'model':{'name':'mlp', 'n_layers':20}, 
                'dataset':'mnist', 'batch_size':1}]
    savedir_base = '.tmp'

    
    # NOTE(review): `job_config` is not defined in the visible code (the only
    # visible definition is commented out above) -- confirm it is defined
    # elsewhere before running this script.
    jm = hjb.JobManager(exp_list=exp_list, 
                    savedir_base=savedir_base, 
                    workdir=os.path.dirname(os.path.realpath(__file__)),
                    job_config=job_config,
                    )
    # Snapshot the jobs that exist before submitting anything.
    job_list_old = jm.get_jobs()

    # Submit a single command; its logs go to a randomly numbered
    # sub-folder of savedir_base.
    savedir_logs = '%s/%s' % (savedir_base, np.random.randint(1000))
    os.makedirs(savedir_logs, exist_ok=True)
    command = 'echo 2'
    job_id = jm.submit_job(command,  workdir=jm.workdir, savedir_logs=savedir_logs)

    # Fetch the jobs again; the check below expects the new job first.
    job_list = jm.get_jobs()
    job = jm.get_job(job_id)
    assert job_list[0].id == job_id
Beispiel #6
0
def run_wizard(func, exp_list=None, exp_groups=None, job_config=None,
               savedir_base=None, reset=None, args=None, use_threads=False,
               exp_id=None, python_binary_path='python',
               python_file_path=None, workdir=None):
    """Run ``func`` over a set of experiments, locally or as launched jobs.

    Args:
        func: Callable invoked as ``func(exp_dict=..., savedir=..., args=...)``
            once per experiment when running locally.
        exp_list: Experiments to run. When ``None`` (and no ``exp_id`` is
            given) the list is built from ``exp_groups``.
        exp_groups: Indexed by each name in ``args.exp_group_list``; every
            entry is appended to the experiment list.
        job_config: Required when ``args.run_jobs`` is truthy; must contain
            the key ``'account_id'``.
        savedir_base: Results root; falls back to ``args.savedir_base``.
        reset: Forwarded to ``create_experiment``; falls back to
            ``args.reset``.
        args: Caller-supplied options. When given, its original attributes
            are forwarded as ``--key value`` flags to launched jobs, and
            every attribute returned by ``get_args()`` is then set on it.
        use_threads: Forwarded to ``launch_menu`` as ``in_parallel``.
        exp_id: When set, only the experiment stored under
            ``<savedir_base>/<exp_id>/exp_dict.json`` is run.
        python_binary_path: Interpreter used in the job command.
        python_file_path: Script used in the job command; defaults to the
            file name of ``sys.argv[0]``.
        workdir: Working directory given to the job manager; defaults to the
            current directory.
    """
    if args is None:
        args = get_args()
        custom_args = {}
    else:
        # Remember what the caller passed before the wizard's own options
        # are written onto the namespace.
        custom_args = vars(args).copy()
        for option_name, option_value in vars(get_args()).items():
            setattr(args, option_name, option_value)

    # Explicit keyword arguments win over the parsed options whenever they
    # are truthy.
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    if exp_id is None:
        # No single experiment requested: gather the groups if needed.
        if exp_list is None:
            exp_list = []
            for group_name in args.exp_group_list:
                exp_list.extend(exp_groups[group_name])

        # Optionally create the results notebook.
        notebook_name = args.visualize_notebook
        if len(notebook_name) > 0:
            if '.ipynb' not in notebook_name:
                notebook_name = notebook_name + '.ipynb'
            create_jupyter_file(fname=notebook_name,
                                savedir_base=savedir_base)
    else:
        # A single experiment was requested: reload its saved definition.
        exp_folder = os.path.join(savedir_base, exp_id)
        exp_list = [hu.load_json(os.path.join(exp_folder, "exp_dict.json"))]

    hu.check_duplicates(exp_list)
    print('\nRunning %d experiments' % len(exp_list))

    if not args.run_jobs:
        # Local execution: run every experiment in this process.
        for experiment in exp_list:
            experiment_dir = create_experiment(experiment, savedir_base,
                                               reset=reset, verbose=True)
            func(exp_dict=experiment, savedir=experiment_dir, args=args)
        return

    # Job execution: hand the experiments over to the job manager.
    from haven import haven_jobs as hjb
    assert job_config is not None
    assert 'account_id' in job_config

    if workdir is None:
        workdir = os.getcwd()

    manager = hjb.JobManager(exp_list=exp_list,
                             savedir_base=savedir_base,
                             workdir=workdir,
                             job_config=job_config)

    if python_file_path is None:
        python_file_path = os.path.basename(sys.argv[0])

    # Options handled by the wizard itself are not forwarded to the jobs.
    wizard_only = ['savedir_base', 'sb', 'ei', 'exp_id', 'e',
                   'exp_group_list', 'j', 'run_jobs', 'r', 'reset', 'v',
                   'visualize_notebook']
    forwarded_flags = "".join(f" --{name} {value}"
                              for name, value in custom_args.items()
                              if name not in wizard_only)
    launch_command = (f'{python_binary_path} {python_file_path} -ei <exp_id> -sb {savedir_base}'
                      + forwarded_flags)

    print(launch_command)
    manager.launch_menu(command=launch_command, in_parallel=use_threads)
Beispiel #7
0
def run_wizard(
    func,
    exp_list=None,
    exp_groups=None,
    job_config=None,
    savedir_base=None,
    reset=None,
    args=None,
    use_threads=False,
    exp_id=None,
    python_binary_path="python",
    python_file_path=None,
    workdir=None,
    job_scheduler=None,
    save_logs=True,
    filter_duplicates=False,
    results_fname=None,
):
    """Run ``func`` over a set of experiments, locally or via a job scheduler.

    Args:
        func: Callable invoked as ``func(exp_dict=..., savedir=..., args=...)``
            once per experiment when no job scheduler is selected.
        exp_list: Experiments to run. When ``None`` (and no ``exp_id`` is
            given) the list is built from ``exp_groups``.
        exp_groups: Indexed by each name in ``args.exp_group_list``; every
            entry is appended to the experiment list.
        job_config: Required when a job scheduler is selected; must contain
            the key ``"account_id"``.
        savedir_base: Results root; falls back to ``args.savedir_base``.
        reset: Forwarded to ``create_experiment``; falls back to
            ``args.reset``.
        args: Caller-supplied options. When given, its attributes are
            forwarded as ``--key value`` flags to launched jobs, and any
            attribute it lacks is filled in from ``get_args()``.
        use_threads: Forwarded to ``launch_menu`` as ``in_parallel``.
        exp_id: When set, only the experiment stored under
            ``<savedir_base>/<exp_id>/exp_dict.json`` is run.
        python_binary_path: Interpreter used in the job command.
        python_file_path: Script used in the job command; defaults to the
            file name of ``sys.argv[0]``.
        workdir: Working directory given to the job manager; defaults to the
            current directory.
        job_scheduler: ``None``/``"None"``/``"0"`` run locally, ``"1"`` is an
            alias for ``"toolkit"``, and ``"toolkit"``, ``"slurm"`` or
            ``"gcp"`` select that scheduler. When ``None`` the value of
            ``args.job_scheduler`` is used instead.
        save_logs: Forwarded to the job manager.
        filter_duplicates: When true, duplicates are removed from the
            experiment list with ``hu.filter_duplicates`` before running.
        results_fname: Optional notebook file name to create; must contain
            ``.ipynb``.

    Raises:
        ValueError: If ``results_fname`` lacks ``.ipynb`` or the job
            scheduler name is not recognised.
    """
    if args is None:
        args = get_args()
        custom_args = {}
    else:
        custom_args = vars(args).copy()
        # Only fill in the options the caller's namespace does not define.
        for k, v in vars(get_args()).items():
            if k in custom_args:
                continue
            setattr(args, k, v)

    # Asserts
    # =======
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    # Collect experiments
    # ===================
    if exp_id is not None:
        # select one experiment
        savedir = os.path.join(savedir_base, exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))

        exp_list = [exp_dict]

    elif exp_list is None:
        # select exp group
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_groups[exp_group_name]

    if filter_duplicates:
        n_total = len(exp_list)
        exp_list = hu.filter_duplicates(exp_list)
        print(f"Filtered {len(exp_list)}/{n_total}")

    hu.check_duplicates(exp_list)
    print("\nRunning %d experiments" % len(exp_list))

    # save results folder
    if exp_id is None and results_fname is not None:
        if len(results_fname):
            if ".ipynb" not in results_fname:
                raise ValueError(".ipynb should be the file extension")
            hj.create_jupyter_file(fname=results_fname,
                                   savedir_base=savedir_base)

    # Run experiments
    # ===============
    if job_scheduler is None:
        job_scheduler = args.job_scheduler

    if job_scheduler in [None, "None", "0"]:
        job_scheduler = None

    elif job_scheduler in ["1"]:
        job_scheduler = "toolkit"

    elif job_scheduler not in ["toolkit", "slurm", "gcp"]:
        raise ValueError(f"{job_scheduler} does not exist")

    # A recognised scheduler name is kept as-is. It used to be replaced by
    # ``args.job_scheduler`` here, which discarded an explicitly passed
    # ``job_scheduler`` in favour of an unvalidated value.

    if job_scheduler is None:
        for exp_dict in exp_list:
            savedir = create_experiment(exp_dict,
                                        savedir_base,
                                        reset=reset,
                                        verbose=True)
            # do trainval
            func(exp_dict=exp_dict, savedir=savedir, args=args)

    else:
        # launch jobs
        print(f"Using Job Scheduler: {job_scheduler}")

        from haven import haven_jobs as hjb

        assert job_config is not None
        assert "account_id" in job_config

        if workdir is None:
            workdir = os.getcwd()

        jm = hjb.JobManager(
            exp_list=exp_list,
            savedir_base=savedir_base,
            workdir=workdir,
            job_config=job_config,
            job_scheduler=job_scheduler,
            save_logs=save_logs,
        )

        if python_file_path is None:
            python_file_path = os.path.split(sys.argv[0])[-1]

        command = f"{python_binary_path} {python_file_path} --exp_id <exp_id> --savedir_base {savedir_base}"

        # Options handled by the wizard itself are not forwarded to the jobs.
        wizard_only = (
            "savedir_base",
            "sb",
            "ei",
            "exp_id",
            "e",
            "exp_group_list",
            "j",
            "job_scheduler",
            "r",
            "reset",
        )
        for k, v in custom_args.items():
            if k not in wizard_only:
                command += f" --{k} {v}"

        print(command)
        jm.launch_menu(command=command, in_parallel=use_threads)
Beispiel #8
0
        savedir = os.path.join(args.savedir_base, args.exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))

        exp_list = [exp_dict]

    else:
        # select exp group
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_configs.EXP_GROUPS[exp_group_name]

    # Run experiments
    # ===============
    if args.run_jobs:
        from haven import haven_jobs as hjb
        jm = hjb.JobManager(exp_list=exp_list, savedir_base=args.savedir_base)
        jm_summary_list = jm.get_summary()
        print(jm.get_summary()['status'])

        import usr_configs as uc
        uc.run_jobs(exp_list, args.savedir_base, args.datadir)

    else:
        for exp_dict in exp_list:
            # do trainval
            trainval(exp_dict=exp_dict,
                     savedir_base=args.savedir_base,
                     datadir=args.datadir,
                     reset=args.reset,
                     num_workers=args.num_workers)
Beispiel #9
0
def test_toolkit():
    """End-to-end check of the toolkit job manager.

    Submits a single command, verifies it shows up in the job list, launches
    the experiment list, and finally builds the results dashboard.
    """
    import job_configs

    experiments = [
        {
            "model": {"name": "mlp", "n_layers": 20},
            "dataset": "mnist",
            "batch_size": 1,
        },
    ]
    results_root = os.path.realpath(".tmp")
    os.makedirs(results_root, exist_ok=True)

    this_dir = os.path.dirname(os.path.realpath(__file__))
    manager = hjb.JobManager(
        exp_list=experiments,
        savedir_base=results_root,
        workdir=this_dir,
        job_config=job_configs.JOB_CONFIG,
    )

    # Jobs known before anything is submitted.
    jobs_before = manager.get_jobs()

    # Submit one command; its logs go to a randomly numbered folder.
    log_dir = "%s/%s" % (results_root, np.random.randint(1000))
    os.makedirs(log_dir, exist_ok=True)
    submitted_id = manager.submit_job("echo 2",
                                      workdir=manager.workdir,
                                      savedir_logs=log_dir)

    # The new job must be the first entry and the only addition.
    jobs_after = manager.get_jobs()
    manager.get_job(submitted_id)  # looked up, result not inspected
    assert jobs_after[0]["id"] == submitted_id

    print("jobs:", len(jobs_before), len(jobs_after))
    assert len(jobs_after) == (len(jobs_before) + 1)

    # Launch the whole experiment list sequentially.
    manager.launch_exp_list(command="echo 2 -e <exp_id>",
                            reset=1,
                            in_parallel=False)

    job_record = os.path.join(results_root, hu.hash_dict(experiments[0]),
                              "job_dict.json")
    assert os.path.exists(job_record)

    summaries = manager.get_summary_list()
    print(hu.filter_list(summaries, {"job_state": "SUCCEEDED"}))
    print(hu.group_list(summaries, key="job_state", return_count=True))

    # Build and show the dashboard from the saved results.
    result_manager = hr.ResultManager(exp_list=experiments,
                                      savedir_base=results_root)
    result_manager.get_job_summary()  # fetched, result not inspected

    dashboard = hj.get_dashboard(result_manager, wide_display=True)
    dashboard.display()