Example 1
def get_run_data(run_names, plot_field, method_name, cfg='./config.yaml'):
    config_mgr.init(cfg)

    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')

    all_df = None

    api = wandb.Api()
    for run_name in run_names:
        runs = api.runs(f"{wb_entity}/{wb_proj_name}",
                        {"config.prefix": run_name})
        assert len(runs) == 1
        wbrun = next(iter(runs))
        df = wbrun.history(samples=15000)
        df = df[['_step', plot_field]]
        df['run'] = run_name

        if all_df is None:
            all_df = df
        else:
            all_df = pd.concat([all_df, df])

    all_df['method'] = method_name

    return all_df
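A minimal usage sketch, assuming `config.yaml` defines `proj_name` and `wb_entity` and that the listed runs logged a scalar metric; the run prefixes and the metric name below are hypothetical:

# Hypothetical usage: fetch one metric for two runs and average it per step.
# 'run-a1', 'run-a2', and 'eval_reward' are placeholder names.
df = get_run_data(['run-a1', 'run-a2'], 'eval_reward', 'ours')
print(df.groupby('_step')['eval_reward'].mean().tail())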
Example 2
    def backup(self, args, global_step):
        log_dir = osp.join(args.log_dir, args.env_name, args.prefix)
        model_dir = osp.join(args.save_dir, args.env_name, args.prefix)
        vid_dir = osp.join(args.vid_dir, args.env_name, args.prefix)

        log_base_dir = log_dir.rsplit('/', 1)[0]
        model_base_dir = model_dir.rsplit('/', 1)[0]
        vid_base_dir = vid_dir.rsplit('/', 1)[0]
        proj_name = config_mgr.get_prop('proj_name')
        sync_host = config_mgr.get_prop('sync_host')
        sync_user = config_mgr.get_prop('sync_user')
        sync_port = config_mgr.get_prop('sync_port')
        cmds = [
            "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
            .format(sync_port, sync_user, sync_host, proj_name, log_dir),
            "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
            .format(sync_port, sync_user, sync_host, proj_name, model_dir),
            "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
            .format(sync_port, sync_user, sync_host, proj_name, vid_dir),
            'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
            .format(sync_port, log_dir, sync_user, sync_host, proj_name,
                    log_base_dir),
            'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
            .format(sync_port, model_dir, sync_user, sync_host, proj_name,
                    model_base_dir),
            'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
            .format(sync_port, vid_dir, sync_user, sync_host, proj_name,
                    vid_base_dir),
        ]
        os.system("\n".join(cmds))
        print('\n' + '*' * 50)
        print('*' * 5 + ' backup at global step {}'.format(global_step))
        print('*' * 50 + '\n')
        print('')
Example 3
def get_wb_ray_config(config):
    config["wandb"] = {
        "project": config_mgr.get_prop("proj_name"),
        "api_key": config_mgr.get_prop("wb_api_key"),
        "log_config": True,
    }
    return config
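A minimal sketch of how this helper might be applied; it only adds the `wandb` entry to whatever config dict is passed in (the other keys here are placeholders, not part of the source):

# Hypothetical Tune-style config dict; only the 'wandb' key is added by the helper.
tune_config = {'lr': 3e-4, 'num_workers': 4}
tune_config = get_wb_ray_config(tune_config)
print(tune_config['wandb']['project'])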
Example 4
def convert_to_prefix(run_names, info):
    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')
    api = wandb.Api()
    prefixes = []
    for section, run_name in run_names:
        run = api.run(f"{wb_entity}/{wb_proj_name}/{run_name}")
        prefix = run.config['prefix']
        prefixes.append((prefix, section, run.config['env_name'], {
            'section': section,
            'prefix': prefix,
            **info
        }))
    return prefixes
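To make the input and output shapes concrete (the run IDs, section names, and report name below are placeholders): each input pair is `(section, run_id)` and each output tuple is `(prefix, section, env_name, info_dict)`.

# Hypothetical (section, W&B run ID) pairs pulled from a report.
pairs = [('ours', '3k2j1x9a'), ('baseline', '9fq0slh2')]
prefixes = convert_to_prefix(pairs, {'report_name': 'Main results'})
# Each entry looks like: (prefix, section, env_name,
#   {'section': ..., 'prefix': ..., 'report_name': 'Main results'})
print(prefixes[0])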
Example 5
def get_cmds(cmd_path, spec_args):
    try:
        open_cmd = cmd_path + '.cmd'
        print('opening', open_cmd)
        with open(open_cmd) as f:
            cmds = f.readlines()
    except OSError:
        raise ValueError(f"Command at {cmd_path} does not exist")

    # Filter out comment lines and blank lines.
    cmds = list(filter(lambda x: not (x.startswith('#') or x == '\n'), cmds))
    cmds = [cmd.rstrip() + " " for cmd in cmds]

    # Check if any commands are references to other commands
    all_ref_cmds = []
    for i, cmd in enumerate(cmds):
        if cmd.startswith('R:'):
            cmd_parts = cmd.split(':')[1].split(' ')
            ref_cmd_loc = cmd_parts[0]
            full_ref_cmd_loc = osp.join(config_mgr.get_prop('cmds_loc'),
                                        ref_cmd_loc)
            ref_cmds = get_cmds(full_ref_cmd_loc.rstrip(),
                                [*cmd_parts[1:], *spec_args])
            all_ref_cmds.extend(ref_cmds)
    cmds = list(filter(lambda x: not x.startswith('R:'), cmds))

    cmds = [cmd + add_on_args(spec_args) for cmd in cmds]

    cmds.extend(all_ref_cmds)
    return cmds
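For orientation, a command file this parser could consume might look like the following; the file name, flags, and the referenced `eval` command are illustrative only. Lines starting with `#` and blank lines are dropped, and an `R:` line pulls in another `.cmd` file (resolved under the configured `cmds_loc`), forwarding its trailing tokens as extra arguments:

# contents of a hypothetical train_all.cmd
python train.py --prefix baseline --lr 3e-4
python train.py --prefix ablation --lr 1e-4
# include the commands from eval.cmd and forward one extra argument
R:eval --num-eval 100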
Example 6
def full_execute_command_file():
    parser = get_arg_parser()
    print(os.getcwd())
    args, rest = parser.parse_known_args()
    config_mgr.init(args.cfg)

    cmd_path = osp.join(config_mgr.get_prop('cmds_loc'), args.cmd)
    execute_command_file(cmd_path, rest, args.cd, args.sess_name, args.sess_id,
                         args.seed, args)
Example 7
    def __init__(self,
                 wb_proj_name=None,
                 should_log_vids=False,
                 wb_entity=None,
                 skip_create_wb=False):
        """
        - wb_proj_name: (string) if None, will use the proj_name provided in
          the `config.yaml` file.
        """
        super().__init__()
        if wb_proj_name is None:
            wb_proj_name = config_mgr.get_prop('proj_name')
        if wb_entity is None:
            wb_entity = config_mgr.get_prop('wb_entity')
        self.wb_proj_name = wb_proj_name
        self.wb_entity = wb_entity
        self.should_log_vids = should_log_vids
        self.skip_create_wb = skip_create_wb
Example 8
    def __init__(self, log_interval, args):
        wb_proj_name = config_mgr.get_prop('proj_name')
        wb_entity = config_mgr.get_prop('wb_entity')

        if args.prefix.count('-') >= 4:
            # Remove the seed and random ID info.
            parts = args.prefix.split('-')
            group_id = '-'.join([*parts[:2], *parts[4:]])
        else:
            group_id = None

        wandb.init(project=wb_proj_name,
                   name=args.prefix,
                   entity=wb_entity,
                   group=group_id)
        wandb.config.update(args)

        self.log_dict = {}
        self.env_steps = None
        self.log_interval = log_interval
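As a concrete illustration of the grouping rule above (the prefix is made up): with at least four dashes, the third and fourth components (seed and random ID) are dropped from the W&B group name.

# Hypothetical prefix: <env>-<method>-<seed>-<random id>-<extra tags>
parts = 'cube-ppo-31-Xq7Z-lr3e4'.split('-')
group_id = '-'.join([*parts[:2], *parts[4:]])
print(group_id)  # -> 'cube-ppo-lr3e4'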
Example 9
def generate_hab_run_file(log_file, ident, python_path, cmd, prefix, st,
                          ntasks, g, c, use_overcap, args):
    ignore_nodes_s = ",".join(config_mgr.get_prop("slurm_ignore_nodes", []))
    if len(ignore_nodes_s) != 0:
        ignore_nodes_s = '#SBATCH -x ' + ignore_nodes_s

    add_options = [ignore_nodes_s]
    if use_overcap:
        add_options.append('#SBATCH --account=overcap')
    if args.time is not None:
        add_options.append(f"#SBATCH --time={args.time}")
    add_options = '\n'.join(add_options)

    pre_python_txt = ''
    python_parts = cmd.split("python")
    has_python = False
    if len(python_parts) > 1:
        pre_python_txt = python_parts[0]
        cmd = "python" + python_parts[1]
        has_python = True

    cpu_options = '#SBATCH --cpus-per-task %i' % int(c)
    if args.speed:
        cpu_options = '#SBATCH --overcommit\n'
        cpu_options += '#SBATCH --cpu-freq=performance\n'
        cpu_options += '#SBATCH -c $(((${SLURM_CPUS_PER_TASK} * ${SLURM_TASKS_PER_NODE})))'

    if has_python:
        run_cmd = python_path + "/" + cmd
        requeue_s = '#SBATCH --requeue'
    else:
        run_cmd = cmd
        requeue_s = ''

    fcontents = """#!/bin/bash
#SBATCH --job-name=%s
#SBATCH --output=%s
#SBATCH --gres gpu:%i
%s
#SBATCH --nodes 1
#SBATCH --signal=USR1@600
#SBATCH --ntasks-per-node %i
%s
#SBATCH -p %s
%s

export GLOG_minloglevel=2
export MAGNUM_LOG=quiet
export MULTI_PROC_OFFSET=%i

export MASTER_ADDR=$(srun --ntasks=1 hostname 2>&1 | tail -n1)

set -x
srun %s"""
    if prefix is not None:
        job_name = prefix + '_' + ident
    else:
        job_name = ident
    log_file_loc = '/'.join(log_file.split('/')[:-1])
    fcontents = fcontents % (job_name, log_file, int(g), cpu_options,
                             int(ntasks), requeue_s, st, add_options,
                             args.mp_offset, run_cmd)
    job_file = osp.join(log_file_loc, job_name + '.sh')
    with open(job_file, 'w') as f:
        f.write(fcontents)
    return job_file, job_name
Example 10
def execute_command_file(cmd_path, add_args_str, cd, sess_name, sess_id, seed,
                         args):
    cmds = get_cmds(cmd_path, add_args_str)

    n_seeds = 1
    if args.cmd_format == 'reg':
        cmd_format = '--'
        spacer = ' '
    elif args.cmd_format == 'nodash':
        cmd_format = ''
        spacer = '='
    else:
        raise ValueError(f"{args.cmd_format} does not match anything")

    if seed is not None and len(seed.split(',')) > 1:
        seeds = seed.split(',')
        common_id = ''.join(
            random.sample(string.ascii_uppercase + string.digits, k=2))

        cmds = [transform_prefix(cmd, common_id) for cmd in cmds]
        cmds = [
            cmd + f" {cmd_format}seed{spacer}{seed}" for cmd in cmds
            for seed in seeds
        ]
        n_seeds = len(seeds)
    elif seed is not None:
        cmds = [x + f" {cmd_format}seed{spacer}{seed}" for x in cmds]
    add_on = ''

    if (len(cmds) // n_seeds) > 1:
        # Make sure all the commands share the last part of the prefix so they
        # can find each other. The ID is long because it is really bad if a job
        # pairs up with the wrong job.
        common_id = ''.join(
            random.sample(string.ascii_uppercase + string.ascii_lowercase +
                          string.digits,
                          k=6))
        cmds = [transform_prefix(cmd, common_id) for cmd in cmds]

    if args.pt_proc != -1:
        pt_dist_str = f"MULTI_PROC_OFFSET={args.mp_offset} python -u -m torch.distributed.launch --use_env --nproc_per_node {args.pt_proc} "

        def make_dist_cmd(x):
            parts = x.split(' ')
            runf = None
            for i, part in enumerate(parts):
                if '.py' in part:
                    runf = i
                    break

            if runf is None:
                raise ValueError('Could not split command')

            rest = ' '.join(parts[runf:])
            return pt_dist_str + rest

        cmds[0] = make_dist_cmd(cmds[0])

    if args.debug is not None:
        print('IN DEBUG MODE')
        cmds = [cmds[args.debug]]

    env_vars = " ".join(config_mgr.get_prop('add_env_vars', []))
    if len(env_vars) != 0:
        env_vars += " "

    if sess_id == -1:
        if len(cmds) == 1:
            exec_cmd = cmds[0]
            if cd != '-1':
                exec_cmd = 'CUDA_VISIBLE_DEVICES=' + cd + ' ' + exec_cmd + ' ' + add_on
            else:
                exec_cmd = exec_cmd + ' ' + add_on
            if not args.skip_env:
                exec_cmd = env_vars + exec_cmd
            print('executing ', exec_cmd)
            os.system(exec_cmd)
        else:
            raise ValueError(
                'Running multiple jobs. You must specify tmux session id')
    else:

        def as_list(x):
            if isinstance(x, int):
                return [x for _ in cmds]
            x = x.split('|')
            if len(x) == 1:
                x = [x[0] for _ in cmds]
            return x

        cd = as_list(cd)
        ntasks = as_list(args.ntasks)
        g = as_list(args.g)
        c = as_list(args.c)

        for cmd_idx, cmd in enumerate(cmds):
            new_window = get_tmux_window(sess_name, sess_id)
            cmd += ' ' + add_on
            print('running full command %s\n' % cmd)

            # Send the keys to run the command
            conda_env = config_mgr.get_prop('conda_env')
            if args.st is None:
                if not args.skip_env:
                    cmd = env_vars + cmd
                last_pane = new_window.attached_pane
                last_pane.send_keys(cmd, enter=False)
                pane = new_window.split_window(attach=False)
                pane.set_height(height=50)
                pane.send_keys('source deactivate')

                pane.send_keys('source activate ' + conda_env)
                pane.enter()
                if cd[cmd_idx] != '-1':
                    pane.send_keys('export CUDA_VISIBLE_DEVICES=' +
                                   cd[cmd_idx])
                    pane.enter()
                pane.send_keys(cmd)
                pane.enter()
            else:
                # Make command into a SLURM command
                base_data_dir = config_mgr.get_prop("base_data_dir")
                python_path = osp.join(osp.expanduser("~"), "miniconda3",
                                       "envs", conda_env, "bin")
                runs_dir = "data/log/runs"
                if not osp.exists(runs_dir):
                    os.makedirs(runs_dir)

                parts = cmd.split(" ")
                prefix = None
                for i, x in enumerate(parts):
                    if x == '--prefix' or x == 'PREFIX':
                        prefix = parts[i + 1].replace('"', '')
                        break

                new_log_dir = osp.join(base_data_dir, 'log')
                new_vids_dir = osp.join(base_data_dir, 'vids')
                new_save_dir = osp.join(base_data_dir, 'trained_models')
                ident = str(uuid.uuid4())[:8]
                log_file = osp.join(runs_dir, ident) + ".log"

                last_pane = new_window.attached_pane
                last_pane.send_keys(f"tail -f {log_file}", enter=False)
                pane = new_window.split_window(attach=False)
                pane.set_height(height=10)

                new_dirs = []
                cmd_args = cmd.split(' ')
                if not args.skip_add:
                    for k, v in config_mgr.get_prop('change_cmds').items():
                        if k in cmd_args:
                            continue
                        new_dirs.append(k + " " + osp.join(base_data_dir, v))
                    cmd += " " + (" ".join(new_dirs))

                if not args.slurm_no_batch:
                    run_file, run_name = generate_hab_run_file(
                        log_file, ident, python_path, cmd, prefix, args.st,
                        ntasks[cmd_idx], g[cmd_idx], c[cmd_idx], args.overcap,
                        args)
                    print(f"Running file at {run_file}")
                    pane.send_keys(f"sbatch {run_file}")
                    time.sleep(2)
                    pane.send_keys(f"scancel {run_name}", enter=False)
                else:
                    srun_settings = f"--gres=gpu:{args.g} " + \
                            f"-p {args.st} " + \
                            f"-c {args.c} " + \
                            f"-J {prefix}_{ident} " + \
                            f"-o {log_file}"

                    # This assumes the command begins with "python ..."
                    cmd = f"srun {srun_settings} {python_path}/{cmd}"
                    pane.send_keys(cmd)

        print('everything should be running...')
Example 11
import argparse
import os


def get_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='./config.yaml')
    parser.add_argument('--sync-dirs', type=str, default=None)
    return parser


if __name__ == '__main__':
    parser = get_arg_parser()
    args = parser.parse_args()
    config_mgr.init(args.cfg)

    proj_name = config_mgr.get_prop('proj_name')
    sync_host = config_mgr.get_prop('sync_host')
    sync_user = config_mgr.get_prop('sync_user')
    sync_port = config_mgr.get_prop('sync_port')

    cmds = []
    for sync_dir in args.sync_dirs.split(','):
        parent_sync_dir = '/'.join(sync_dir.split('/')[:-1])
        cmds.extend([
            "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
            .format(sync_port, sync_user, sync_host, proj_name, sync_dir),
            'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
            .format(sync_port, sync_dir, sync_user, sync_host, proj_name,
                    parent_sync_dir),
        ])
Example 12
parser.add_argument('--env', default=None, type=str)
parser.add_argument('--type',
                    default='policy',
                    type=str,
                    help='"emb" or "policy"')
parser.add_argument('--save-name',
                    default='def',
                    type=str,
                    help='Name of the file to save')
parser.add_argument('--no-wand', action='store_true', default=False)
parser.add_argument('--lvl', default=None, type=str, help='Create game level')
parser.add_argument('--test-runs', default=None, type=str, help='')
parser.add_argument('--train-eval', action='store_true', default=False)
parser.add_argument('--render-high', action='store_true', default=False)

cmd_folder = config_mgr.get_prop('cmds_loc')


def add_on_args(spec_args):
    spec_args = ['"' + x + '"' if ' ' in x else x for x in spec_args]
    return ((' '.join(spec_args)))


def get_cmds(cmd_loc, train_type, spec_args, args):
    try:
        open_cmd = osp.join(cmd_folder, cmd_loc + '.cmd')
        print('opening', open_cmd)
        with open(open_cmd) as f:
            cmds = f.readlines()
    except OSError:
        raise ValueError('Must place %s command in %s' % (cmd_loc, cmd_folder))
Example 13
def get_report_data(report_name,
                    plot_field,
                    plot_sections,
                    force_refresh=False,
                    match_pat=None,
                    other_plot_fields=[],
                    cfg='./config.yaml',
                    other_fetch_fields=[],
                    get_any_cols=False):
    """
    Converts the selected data sets in a W&B report into a Pandas DataFrame.
    Fetches only the plot_field you specify.
    - get_any_cols: If True, filters plot_field down to the subset of columns
      that are present in the run's data.
    """
    config_mgr.init(cfg)

    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')
    wb_search = config_mgr.get_prop('wb_search',
                                    wb_entity + '/' + wb_proj_name)

    save_report_name = report_name.replace(' ', '-').replace("/", "-")
    cacher = CacheHelper(f"{wb_entity}_{wb_proj_name}_{save_report_name}",
                         plot_sections)
    all_df = None

    if cacher.exists() and not force_refresh:
        all_df = cacher.load()
        uniq_methods = all_df['method'].unique()
        for k in uniq_methods:
            idx = plot_sections.index(k)
            del plot_sections[idx]
        if len(plot_sections) == 0:
            return all_df

    api = wandb.Api()
    run_ids = get_run_ids_from_report(wb_search, report_name, plot_sections,
                                      api)
    for report_section, run_id in run_ids:
        wbrun = api.run(f"{wb_entity}/{wb_proj_name}/{run_id}")
        if match_pat is not None:
            any_matches = False
            for x in match_pat:
                if x in wbrun.name:
                    any_matches = True
                    break
            if not any_matches:
                continue
        df = wbrun.history(samples=15000)

        if not isinstance(plot_field, str):
            orig_not_found = False
            for k in plot_field:
                if k not in df.columns:
                    orig_not_found = True
                    break
            if orig_not_found:
                if len(other_plot_fields) > 0:
                    plot_field = other_plot_fields
                if get_any_cols:
                    plot_field = [x for x in plot_field if x in df.columns]
                for k in plot_field:
                    if k not in df.columns:
                        raise ValueError(
                            f"Requested key {k} is not present in data frame"
                            f" with {df.columns} for run {run_id}"
                            f" section {report_section}")

            df = df[['_step', *plot_field]]
        else:
            if plot_field not in df.columns:
                match_other_plot = None
                for k in other_plot_fields:
                    if k in df.columns:
                        match_other_plot = k
                        break
                if match_other_plot is None:
                    raise ValueError("""
                            Could not find colums from %s in %s containing %s
                            """ % (str(other_plot_fields), report_section,
                                   str(df.columns)))
                df = df.rename(columns={match_other_plot: plot_field})
            df = df[['_step', plot_field]]

        if len(other_fetch_fields) > 0:
            run_cfg = json.loads(wbrun.json_config)
            for k in other_fetch_fields:
                parts = k.split('.')
                cur_d = run_cfg
                for part in parts:
                    cur_d = cur_d[part]
                    if isinstance(cur_d, dict):
                        cur_d = cur_d['value']
                df[k] = cur_d
        df['method'] = report_section
        df['run'] = run_id

        if all_df is None:
            all_df = df
        else:
            all_df = pd.concat([all_df, df])

    if all_df is None:
        raise ValueError(
            f"Could not find any matching reports on wb for {report_name}")

    uniq_methods = all_df['method'].unique()
    for plot_section in plot_sections:
        assert plot_section in uniq_methods, f"'{plot_section}' from {uniq_methods} not found"

    cacher.save(all_df)

    return all_df
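A minimal usage sketch, assuming a W&B report and its section names exist and that the runs logged the requested metric; the report name, sections, and metric below are hypothetical:

# Hypothetical call: one metric, two report sections; results come back as one DataFrame.
df = get_report_data('Main results', 'eval_reward', ['ours', 'baseline'])
print(df.groupby('method')['eval_reward'].max())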
Example 14
def eval_from_file(plot_cfg_path, load_dir, get_run_settings, args):
    with open(plot_cfg_path) as f:
        eval_settings = yaml.safe_load(f)
        config_mgr.init(eval_settings['config_yaml'])
        eval_key = eval_settings['eval_key']
        scale_factor = eval_settings['scale_factor']
        rename_sections = eval_settings['rename_sections']
        wb_proj_name = config_mgr.get_prop('proj_name')
        wb_entity = config_mgr.get_prop('wb_entity')
        api = wandb.Api()
        all_run_names = []
        for eval_section in eval_settings['eval_sections']:
            report_name = eval_section['report_name']
            eval_sections = eval_section['eval_sections']
            cacher = CacheHelper(report_name, eval_sections)
            if cacher.exists() and not eval_section['force_reload']:
                run_names = cacher.load()
            else:
                run_ids = get_run_ids_from_report(wb_entity, wb_proj_name,
                                                  report_name, eval_sections,
                                                  api)
                run_names = convert_to_prefix(run_ids,
                                              {'report_name': report_name})
                cacher.save(run_names)
            all_run_names.extend(run_names)

    full_load_name = osp.join(load_dir, 'data/trained_models')
    full_log_name = osp.join(load_dir, 'data/log')
    method_names = defaultdict(list)
    for name, method_name, env_name, info in all_run_names:
        model_dir = osp.join(full_load_name, env_name, name)
        cmd_path = osp.join(full_log_name, env_name, name)

        if not osp.exists(model_dir):
            raise ValueError(f"Model {model_dir} does not exist", info)

        if not osp.exists(cmd_path):
            raise ValueError(f"Log dir {cmd_path} does not exist")

        model_nums = [
            int(x.split('_')[1].split('.')[0]) for x in os.listdir(model_dir)
            if 'model_' in x
        ]
        if len(model_nums) == 0:
            raise ValueError(f"Model {model_dir} is empty", info)

        max_idx = max(model_nums)
        use_model = osp.join(model_dir, f"model_{max_idx}.pt")

        with open(osp.join(cmd_path, 'cmd.txt'), 'r') as f:
            cmd = f.read()

        method_names[method_name].append((use_model, cmd, env_name, info))

    env_results = defaultdict(lambda: defaultdict(list))
    NUM_PROCS = 20

    total_count = sum([len(x) for x in method_names.values()])

    done_count = 0
    for method_name, runs in method_names.items():
        for use_model, cmd, env_name, info in runs:
            print(f"({done_count}/{total_count})")
            done_count += 1
            cache_result = CacheHelper(
                f"result_{method_name}_{use_model.replace('/', '_')}_{args.num_eval}",
                cmd)
            if cache_result.exists() and not args.override:
                eval_result = cache_result.load()
            else:
                if args.table_only and not args.override:
                    break
                cmd = cmd.split(' ')[2:]
                cmd.append('--no-wb')
                cmd.append('--eval-only')
                cmd.extend(['--cuda', 'False'])
                cmd.extend(['--num-render', '0'])
                cmd.extend(['--eval-num-processes', str(NUM_PROCS)])
                cmd.extend(["--num-eval", f"{args.num_eval // NUM_PROCS}"])
                cmd.extend(["--load-file", use_model])
                run_settings = get_run_settings(cmd)
                run_settings.setup()
                eval_result = run_settings.eval_result
                cache_result.save(eval_result)
            store_num = eval_result[args.get_key]
            env_results[info['report_name']][method_name].append(store_num)
            rutils.pstart_sep()
            print(f"Result for {use_model}: {store_num}")
            rutils.pend_sep()
    print(generate_eval_table(env_results, scale_factor, rename_sections))
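For reference, the plot config loaded at the top of `eval_from_file` would need roughly the following shape once parsed; every value here is a placeholder inferred from the keys read above:

# Hypothetical parsed contents of plot_cfg_path (all values are placeholders).
example_eval_settings = {
    'config_yaml': './config.yaml',
    'eval_key': 'eval_reward',
    'scale_factor': 1.0,
    'rename_sections': {},
    'eval_sections': [{
        'report_name': 'Main results',
        'force_reload': False,
        'eval_sections': ['ours', 'baseline'],
    }],
}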