def get_run_data(run_names, plot_field, method_name, cfg='./config.yaml'):
    config_mgr.init(cfg)
    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')

    all_df = None
    api = wandb.Api()
    for run_name in run_names:
        # Look up the run by the `prefix` key stored in its W&B config.
        runs = api.runs(f"{wb_entity}/{wb_proj_name}",
                        {"config.prefix": run_name})
        assert len(runs) == 1, f"Expected exactly one run for {run_name}"
        wbrun = next(iter(runs))
        df = wbrun.history(samples=15000)
        # Copy so the column assignments below do not warn about writing to a
        # slice.
        df = df[['_step', plot_field]].copy()
        df['run'] = run_name
        if all_df is None:
            all_df = df
        else:
            all_df = pd.concat([all_df, df])
    all_df['method'] = method_name
    return all_df
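# A minimal usage sketch for get_run_data, assuming a config.yaml that defines
# proj_name and wb_entity, and runs logged with a `prefix` config key (the run
# and metric names below are hypothetical):
#
#   df = get_run_data(['run-a', 'run-b'], 'episode_reward', 'ours')
#   print(df.groupby('run')['episode_reward'].max())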
def backup(self, args, global_step):
    log_dir = osp.join(args.log_dir, args.env_name, args.prefix)
    model_dir = osp.join(args.save_dir, args.env_name, args.prefix)
    vid_dir = osp.join(args.vid_dir, args.env_name, args.prefix)

    log_base_dir = log_dir.rsplit('/', 1)[0]
    model_base_dir = model_dir.rsplit('/', 1)[0]
    vid_base_dir = vid_dir.rsplit('/', 1)[0]

    proj_name = config_mgr.get_prop('proj_name')
    sync_host = config_mgr.get_prop('sync_host')
    sync_user = config_mgr.get_prop('sync_user')
    sync_port = config_mgr.get_prop('sync_port')

    # First create the remote backup directories, then mirror each local
    # directory into them with rsync.
    cmds = [
        "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
        .format(sync_port, sync_user, sync_host, proj_name, log_dir),
        "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
        .format(sync_port, sync_user, sync_host, proj_name, model_dir),
        "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
        .format(sync_port, sync_user, sync_host, proj_name, vid_dir),
        'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
        .format(sync_port, log_dir, sync_user, sync_host, proj_name,
                log_base_dir),
        'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
        .format(sync_port, model_dir, sync_user, sync_host, proj_name,
                model_base_dir),
        'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
        .format(sync_port, vid_dir, sync_user, sync_host, proj_name,
                vid_base_dir),
    ]
    os.system("\n".join(cmds))

    print('\n' + '*' * 50)
    print('*' * 5 + ' backup at global step {}'.format(global_step))
    print('*' * 50 + '\n')
def get_wb_ray_config(config):
    config["wandb"] = {
        "project": config_mgr.get_prop("proj_name"),
        "api_key": config_mgr.get_prop("wb_api_key"),
        "log_config": True,
    }
    return config
def convert_to_prefix(run_names, info):
    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')
    api = wandb.Api()

    prefixes = []
    for section, run_name in run_names:
        run = api.run(f"{wb_entity}/{wb_proj_name}/{run_name}")
        prefix = run.config['prefix']
        prefixes.append((prefix, section, run.config['env_name'], {
            'section': section,
            'prefix': prefix,
            **info,
        }))
    return prefixes
def get_cmds(cmd_path, spec_args):
    open_cmd = cmd_path + '.cmd'
    print('opening', open_cmd)
    try:
        with open(open_cmd) as f:
            cmds = f.readlines()
    except FileNotFoundError:
        raise ValueError(f"Command at {cmd_path} does not exist")

    # Filter out comment lines and blank lines.
    cmds = list(filter(lambda x: not (x.startswith('#') or x == '\n'), cmds))
    cmds = [cmd.rstrip() + " " for cmd in cmds]

    # Check if any commands are references to other command files.
    all_ref_cmds = []
    for cmd in cmds:
        if cmd.startswith('R:'):
            cmd_parts = cmd.split(':')[1].split(' ')
            ref_cmd_loc = cmd_parts[0]
            full_ref_cmd_loc = osp.join(config_mgr.get_prop('cmds_loc'),
                                        ref_cmd_loc)
            ref_cmds = get_cmds(full_ref_cmd_loc.rstrip(),
                                [*cmd_parts[1:], *spec_args])
            all_ref_cmds.extend(ref_cmds)

    cmds = list(filter(lambda x: not x.startswith('R:'), cmds))
    cmds = [cmd + add_on_args(spec_args) for cmd in cmds]
    cmds.extend(all_ref_cmds)
    return cmds
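# A hypothetical example of the .cmd file format this parser accepts, based
# only on the logic above (file names and flags below are made up). Note that
# '.cmd' is appended automatically, so references omit the extension:
#
#   # Comment lines and blank lines are skipped.
#   python train.py --env-name pick --prefix run1
#   # An 'R:' line splices in another .cmd file (resolved relative to the
#   # `cmds_loc` config property), forwarding any trailing tokens as extra
#   # spec args:
#   R:shared/base --lr 3e-4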
def full_execute_command_file():
    parser = get_arg_parser()
    print(os.getcwd())
    args, rest = parser.parse_known_args()
    config_mgr.init(args.cfg)
    cmd_path = osp.join(config_mgr.get_prop('cmds_loc'), args.cmd)
    execute_command_file(cmd_path, rest, args.cd, args.sess_name,
                         args.sess_id, args.seed, args)
def __init__(self, wb_proj_name=None, should_log_vids=False, wb_entity=None,
             skip_create_wb=False):
    """
    - wb_proj_name: (string) if None, will use the proj_name provided in the
      `config.yaml` file.
    """
    super().__init__()
    if wb_proj_name is None:
        wb_proj_name = config_mgr.get_prop('proj_name')
    if wb_entity is None:
        wb_entity = config_mgr.get_prop('wb_entity')
    self.wb_proj_name = wb_proj_name
    self.wb_entity = wb_entity
    self.should_log_vids = should_log_vids
    self.skip_create_wb = skip_create_wb
def __init__(self, log_interval, args):
    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')

    if args.prefix.count('-') >= 4:
        # Remove the seed and random ID info.
        parts = args.prefix.split('-')
        group_id = '-'.join([*parts[:2], *parts[4:]])
    else:
        group_id = None

    wandb.init(project=wb_proj_name, name=args.prefix, entity=wb_entity,
               group=group_id)
    wandb.config.update(args)

    self.log_dict = {}
    self.env_steps = None
    self.log_interval = log_interval
def generate_hab_run_file(log_file, ident, python_path, cmd, prefix, st,
                          ntasks, g, c, use_overcap, args):
    ignore_nodes_s = ",".join(config_mgr.get_prop("slurm_ignore_nodes", []))
    if len(ignore_nodes_s) != 0:
        ignore_nodes_s = '#SBATCH -x ' + ignore_nodes_s

    add_options = [ignore_nodes_s]
    if use_overcap:
        add_options.append('#SBATCH --account=overcap')
    if args.time is not None:
        add_options.append(f"#SBATCH --time={args.time}")
    add_options = '\n'.join(add_options)

    # Split off everything before the "python" token (captured but currently
    # unused) so the interpreter path can be prepended below.
    pre_python_txt = ''
    python_parts = cmd.split("python")
    has_python = False
    if len(python_parts) > 1:
        pre_python_txt = python_parts[0]
        cmd = "python" + python_parts[1]
        has_python = True

    cpu_options = '#SBATCH --cpus-per-task %i' % int(c)
    if args.speed:
        cpu_options = '#SBATCH --overcommit\n'
        cpu_options += '#SBATCH --cpu-freq=performance\n'
        cpu_options += '#SBATCH -c $(((${SLURM_CPUS_PER_TASK} * ${SLURM_TASKS_PER_NODE})))'

    if has_python:
        run_cmd = python_path + "/" + cmd
        requeue_s = '#SBATCH --requeue'
    else:
        run_cmd = cmd
        requeue_s = ''

    fcontents = """#!/bin/bash
#SBATCH --job-name=%s
#SBATCH --output=%s
#SBATCH --gres gpu:%i
%s
#SBATCH --nodes 1
#SBATCH --signal=USR1@600
#SBATCH --ntasks-per-node %i
%s
#SBATCH -p %s
%s

export GLOG_minloglevel=2
export MAGNUM_LOG=quiet
export MULTI_PROC_OFFSET=%i
export MASTER_ADDR=$(srun --ntasks=1 hostname 2>&1 | tail -n1)

set -x
srun %s"""

    if prefix is not None:
        job_name = prefix + '_' + ident
    else:
        job_name = ident
    log_file_loc = '/'.join(log_file.split('/')[:-1])

    fcontents = fcontents % (job_name, log_file, int(g), cpu_options,
                             int(ntasks), requeue_s, st, add_options,
                             args.mp_offset, run_cmd)
    job_file = osp.join(log_file_loc, job_name + '.sh')
    with open(job_file, 'w') as f:
        f.write(fcontents)
    return job_file, job_name
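# For illustration, a hypothetical invocation and the sbatch file it would
# produce, assuming slurm_ignore_nodes is empty and args has time=None,
# speed=False, mp_offset=0 (every concrete value below is made up):
#
#   generate_hab_run_file('data/log/runs/ab12cd34.log', 'ab12cd34',
#                         '~/miniconda3/envs/myenv/bin', 'python train.py',
#                         'myrun', 'short', 1, 1, 8, False, args)
#
# writes data/log/runs/myrun_ab12cd34.sh containing roughly:
#
#   #!/bin/bash
#   #SBATCH --job-name=myrun_ab12cd34
#   #SBATCH --output=data/log/runs/ab12cd34.log
#   #SBATCH --gres gpu:1
#   #SBATCH --cpus-per-task 8
#   #SBATCH --requeue
#   #SBATCH -p short
#   ...
#   srun ~/miniconda3/envs/myenv/bin/python train.py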
def execute_command_file(cmd_path, add_args_str, cd, sess_name, sess_id, seed,
                         args):
    cmds = get_cmds(cmd_path, add_args_str)
    n_seeds = 1

    if args.cmd_format == 'reg':
        cmd_format = '--'
        spacer = ' '
    elif args.cmd_format == 'nodash':
        cmd_format = ''
        spacer = '='
    else:
        raise ValueError(f"{args.cmd_format} does not match anything")

    if seed is not None and len(seed.split(',')) > 1:
        seeds = seed.split(',')
        common_id = ''.join(
            random.sample(string.ascii_uppercase + string.digits, k=2))
        cmds = [transform_prefix(cmd, common_id) for cmd in cmds]
        cmds = [
            cmd + f" {cmd_format}seed{spacer}{seed}" for cmd in cmds
            for seed in seeds
        ]
        n_seeds = len(seeds)
    elif seed is not None:
        cmds = [x + f" {cmd_format}seed{spacer}{seed}" for x in cmds]

    add_on = ''
    if (len(cmds) // n_seeds) > 1:
        # Make sure all the commands share the last part of the prefix so they
        # can find each other. The name is long because it's really bad if a
        # job finds the wrong other job.
        common_id = ''.join(
            random.sample(
                string.ascii_uppercase + string.ascii_lowercase +
                string.digits, k=6))
        cmds = [transform_prefix(cmd, common_id) for cmd in cmds]

    if args.pt_proc != -1:
        pt_dist_str = (f"MULTI_PROC_OFFSET={args.mp_offset} python -u -m "
                       f"torch.distributed.launch --use_env "
                       f"--nproc_per_node {args.pt_proc} ")

        def make_dist_cmd(x):
            # Strip everything before the .py entry point and prepend the
            # distributed launcher.
            parts = x.split(' ')
            runf = None
            for i, part in enumerate(parts):
                if '.py' in part:
                    runf = i
                    break
            if runf is None:
                raise ValueError('Could not split command')
            rest = ' '.join(parts[runf:])
            return pt_dist_str + rest

        cmds[0] = make_dist_cmd(cmds[0])

    if args.debug is not None:
        print('IN DEBUG MODE')
        cmds = [cmds[args.debug]]

    env_vars = " ".join(config_mgr.get_prop('add_env_vars', []))
    if len(env_vars) != 0:
        env_vars += " "

    if sess_id == -1:
        if len(cmds) == 1:
            exec_cmd = cmds[0]
            if cd != '-1':
                exec_cmd = ('CUDA_VISIBLE_DEVICES=' + cd + ' ' + exec_cmd +
                            ' ' + add_on)
            else:
                exec_cmd = exec_cmd + ' ' + add_on
            if not args.skip_env:
                exec_cmd = env_vars + exec_cmd
            print('executing ', exec_cmd)
            os.system(exec_cmd)
        else:
            raise ValueError(
                'Running multiple jobs. You must specify tmux session id')
    else:
        def as_list(x):
            if isinstance(x, int):
                return [x for _ in cmds]
            x = x.split('|')
            if len(x) == 1:
                x = [x[0] for _ in cmds]
            return x

        cd = as_list(cd)
        ntasks = as_list(args.ntasks)
        g = as_list(args.g)
        c = as_list(args.c)

        for cmd_idx, cmd in enumerate(cmds):
            new_window = get_tmux_window(sess_name, sess_id)
            cmd += ' ' + add_on
            print('running full command %s\n' % cmd)

            conda_env = config_mgr.get_prop('conda_env')
            if args.st is None:
                # Run the command directly in a tmux pane.
                if not args.skip_env:
                    cmd = env_vars + cmd
                last_pane = new_window.attached_pane
                last_pane.send_keys(cmd, enter=False)
                pane = new_window.split_window(attach=False)
                pane.set_height(height=50)
                pane.send_keys('source deactivate')
                pane.send_keys('source activate ' + conda_env)
                pane.enter()
                if cd[cmd_idx] != '-1':
                    pane.send_keys('export CUDA_VISIBLE_DEVICES=' +
                                   cd[cmd_idx])
                    pane.enter()
                pane.send_keys(cmd)
                pane.enter()
            else:
                # Make the command into a SLURM command.
                base_data_dir = config_mgr.get_prop("base_data_dir")
                python_path = osp.join(osp.expanduser("~"), "miniconda3",
                                       "envs", conda_env, "bin")
                runs_dir = "data/log/runs"
                if not osp.exists(runs_dir):
                    os.makedirs(runs_dir)

                parts = cmd.split(" ")
                prefix = None
                for i, x in enumerate(parts):
                    if x == '--prefix' or x == 'PREFIX':
                        prefix = parts[i + 1].replace('"', '')
                        break

                new_log_dir = osp.join(base_data_dir, 'log')
                new_vids_dir = osp.join(base_data_dir, 'vids')
                new_save_dir = osp.join(base_data_dir, 'trained_models')

                ident = str(uuid.uuid4())[:8]
                log_file = osp.join(runs_dir, ident) + ".log"

                last_pane = new_window.attached_pane
                last_pane.send_keys(f"tail -f {log_file}", enter=False)
                pane = new_window.split_window(attach=False)
                pane.set_height(height=10)

                new_dirs = []
                cmd_args = cmd.split(' ')
                if not args.skip_add:
                    for k, v in config_mgr.get_prop('change_cmds').items():
                        if k in cmd_args:
                            continue
                        new_dirs.append(k + " " + osp.join(base_data_dir, v))
                cmd += " " + (" ".join(new_dirs))

                if not args.slurm_no_batch:
                    run_file, run_name = generate_hab_run_file(
                        log_file, ident, python_path, cmd, prefix, args.st,
                        ntasks[cmd_idx], g[cmd_idx], c[cmd_idx], args.overcap,
                        args)
                    print(f"Running file at {run_file}")
                    pane.send_keys(f"sbatch {run_file}")
                    time.sleep(2)
                    pane.send_keys(f"scancel {run_name}", enter=False)
                else:
                    srun_settings = (f"--gres=gpu:{args.g} "
                                     f"-p {args.st} "
                                     f"-c {args.c} "
                                     f"-J {prefix}_{ident} "
                                     f"-o {log_file}")
                    # This assumes the command begins with "python ...".
                    cmd = f"srun {srun_settings} {python_path}/{cmd}"
                    pane.send_keys(cmd)

    print('everything should be running...')
import argparse
import os


def get_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='./config.yaml')
    parser.add_argument('--sync-dirs', type=str, default=None)
    return parser


if __name__ == '__main__':
    parser = get_arg_parser()
    args = parser.parse_args()
    config_mgr.init(args.cfg)

    proj_name = config_mgr.get_prop('proj_name')
    sync_host = config_mgr.get_prop('sync_host')
    sync_user = config_mgr.get_prop('sync_user')
    sync_port = config_mgr.get_prop('sync_port')

    cmds = []
    for sync_dir in args.sync_dirs.split(','):
        parent_sync_dir = '/'.join(sync_dir.split('/')[:-1])
        cmds.extend([
            "ssh -i ~/.ssh/id_open_rsa/id -p {} {}@{} 'mkdir -p ~/{}_backup/{}'"
            .format(sync_port, sync_user, sync_host, proj_name, sync_dir),
            'rsync -avuzhr -e "ssh -i ~/.ssh/id_open_rsa/id -p {}" {} {}@{}:~/{}_backup/{}'
            .format(sync_port, sync_dir, sync_user, sync_host, proj_name,
                    parent_sync_dir),
        ])
    # Run the queued ssh/rsync commands, following the same pattern as the
    # backup() helper above.
    os.system("\n".join(cmds))
parser.add_argument('--env', default=None, type=str)
parser.add_argument('--type', default='policy', type=str,
                    help='"emb" or "policy"')
parser.add_argument('--save-name', default='def', type=str,
                    help='Name of the file to save')
parser.add_argument('--no-wand', action='store_true', default=False)
parser.add_argument('--lvl', default=None, type=str,
                    help='Create game level')
parser.add_argument('--test-runs', default=None, type=str, help='')
parser.add_argument('--train-eval', action='store_true', default=False)
parser.add_argument('--render-high', action='store_true', default=False)

cmd_folder = config_mgr.get_prop('cmds_loc')


def add_on_args(spec_args):
    # Quote any argument that contains spaces so it survives the shell.
    spec_args = ['"' + x + '"' if ' ' in x else x for x in spec_args]
    return ' '.join(spec_args)


def get_cmds(cmd_loc, train_type, spec_args, args):
    open_cmd = osp.join(cmd_folder, cmd_loc + '.cmd')
    print('opening', open_cmd)
    try:
        with open(open_cmd) as f:
            cmds = f.readlines()
    except FileNotFoundError:
        raise ValueError('Must place %s command in %s' %
                         (cmd_loc, cmd_folder))
def get_report_data(report_name, plot_field, plot_sections,
                    force_refresh=False, match_pat=None, other_plot_fields=[],
                    cfg='./config.yaml', other_fetch_fields=[],
                    get_any_cols=False):
    """
    Converts the selected data sets in a W&B report into a Pandas DataFrame.
    Fetches only the plot_field you specify.
    - get_any_cols: If true, will filter plot_field down to the subset of
      columns which are present in the report.
    """
    config_mgr.init(cfg)

    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')
    wb_search = config_mgr.get_prop('wb_search',
                                    wb_entity + '/' + wb_proj_name)

    save_report_name = report_name.replace(' ', '-').replace("/", "-")
    cacher = CacheHelper(f"{wb_entity}_{wb_proj_name}_{save_report_name}",
                         plot_sections)
    all_df = None
    if cacher.exists() and not force_refresh:
        # Only fetch the sections that are not already cached.
        all_df = cacher.load()
        uniq_methods = all_df['method'].unique()
        for k in uniq_methods:
            idx = plot_sections.index(k)
            del plot_sections[idx]
        if len(plot_sections) == 0:
            return all_df

    api = wandb.Api()
    run_ids = get_run_ids_from_report(wb_search, report_name, plot_sections,
                                      api)
    for report_section, run_id in run_ids:
        wbrun = api.run(f"{wb_entity}/{wb_proj_name}/{run_id}")
        if match_pat is not None:
            any_matches = False
            for x in match_pat:
                if x in wbrun.name:
                    any_matches = True
                    break
            if not any_matches:
                continue
        df = wbrun.history(samples=15000)

        if not isinstance(plot_field, str):
            orig_not_found = False
            for k in plot_field:
                if k not in df.columns:
                    orig_not_found = True
                    break
            if orig_not_found:
                if len(other_plot_fields) > 0:
                    plot_field = other_plot_fields
            if get_any_cols:
                plot_field = [x for x in plot_field if x in df.columns]
            for k in plot_field:
                if k not in df.columns:
                    raise ValueError(
                        f"Requested key {k} is not present in"
                        f" data frame with {df.columns} for run {run_id}"
                        f" section {report_section}")
            df = df[['_step', *plot_field]]
        else:
            if plot_field not in df.columns:
                match_other_plot = None
                for k in other_plot_fields:
                    if k in df.columns:
                        match_other_plot = k
                        break
                if match_other_plot is None:
                    raise ValueError(
                        "Could not find columns from %s in %s containing %s" %
                        (str(other_plot_fields), report_section,
                         str(df.columns)))
                df = df.rename(columns={match_other_plot: plot_field})
            df = df[['_step', plot_field]]

        if len(other_fetch_fields) > 0:
            run_cfg = json.loads(wbrun.json_config)
            for k in other_fetch_fields:
                # Walk the dotted key path, unwrapping W&B's {'value': ...}
                # nesting along the way.
                parts = k.split('.')
                cur_d = run_cfg
                for part in parts:
                    cur_d = cur_d[part]
                    if isinstance(cur_d, dict):
                        cur_d = cur_d['value']
                df[k] = cur_d

        df['method'] = report_section
        df['run'] = run_id
        if all_df is None:
            all_df = df
        else:
            all_df = pd.concat([all_df, df])

    if all_df is None:
        raise ValueError(
            f"Could not find any matching reports on wb for {report_name}")

    uniq_methods = all_df['method'].unique()
    for plot_section in plot_sections:
        assert plot_section in uniq_methods, \
            f"'{plot_section}' from {uniq_methods} not found"

    cacher.save(all_df)
    return all_df
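# A usage sketch for get_report_data, assuming a W&B report named "My Report"
# whose sections are "ours" and "baseline" (both names are hypothetical):
#
#   df = get_report_data('My Report', 'episode_reward',
#                        ['ours', 'baseline'], force_refresh=True)
#   print(df.groupby('method')['episode_reward'].mean())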
def eval_from_file(plot_cfg_path, load_dir, get_run_settings, args):
    with open(plot_cfg_path) as f:
        eval_settings = yaml.safe_load(f)
    config_mgr.init(eval_settings['config_yaml'])

    eval_key = eval_settings['eval_key']
    scale_factor = eval_settings['scale_factor']
    rename_sections = eval_settings['rename_sections']

    wb_proj_name = config_mgr.get_prop('proj_name')
    wb_entity = config_mgr.get_prop('wb_entity')
    api = wandb.Api()

    all_run_names = []
    for eval_section in eval_settings['eval_sections']:
        report_name = eval_section['report_name']
        eval_sections = eval_section['eval_sections']
        cacher = CacheHelper(report_name, eval_sections)
        if cacher.exists() and not eval_section['force_reload']:
            run_names = cacher.load()
        else:
            run_ids = get_run_ids_from_report(wb_entity, wb_proj_name,
                                              report_name, eval_sections, api)
            run_names = convert_to_prefix(run_ids,
                                          {'report_name': report_name})
            cacher.save(run_names)
        all_run_names.extend(run_names)

    full_load_name = osp.join(load_dir, 'data/trained_models')
    full_log_name = osp.join(load_dir, 'data/log')

    method_names = defaultdict(list)
    for name, method_name, env_name, info in all_run_names:
        model_dir = osp.join(full_load_name, env_name, name)
        cmd_path = osp.join(full_log_name, env_name, name)
        if not osp.exists(model_dir):
            raise ValueError(f"Model {model_dir} does not exist", info)
        if not osp.exists(cmd_path):
            raise ValueError(f"Command file {cmd_path} does not exist")
        # Pick the latest checkpoint, e.g. model_120.pt.
        model_nums = [
            int(x.split('_')[1].split('.')[0]) for x in os.listdir(model_dir)
            if 'model_' in x
        ]
        if len(model_nums) == 0:
            raise ValueError(f"Model {model_dir} is empty", info)
        max_idx = max(model_nums)
        use_model = osp.join(model_dir, f"model_{max_idx}.pt")
        with open(osp.join(cmd_path, 'cmd.txt'), 'r') as f:
            cmd = f.read()
        method_names[method_name].append((use_model, cmd, env_name, info))

    env_results = defaultdict(lambda: defaultdict(list))
    NUM_PROCS = 20
    total_count = sum([len(x) for x in method_names.values()])
    done_count = 0
    for method_name, runs in method_names.items():
        for use_model, cmd, env_name, info in runs:
            print(f"({done_count}/{total_count})")
            done_count += 1
            cache_result = CacheHelper(
                f"result_{method_name}_{use_model.replace('/', '_')}_{args.num_eval}",
                cmd)
            if cache_result.exists() and not args.override:
                eval_result = cache_result.load()
            else:
                if args.table_only and not args.override:
                    break
                # Re-run the original training command in evaluation mode.
                cmd = cmd.split(' ')[2:]
                cmd.append('--no-wb')
                cmd.append('--eval-only')
                cmd.extend(['--cuda', 'False'])
                cmd.extend(['--num-render', '0'])
                cmd.extend(['--eval-num-processes', str(NUM_PROCS)])
                cmd.extend(["--num-eval", f"{args.num_eval // NUM_PROCS}"])
                cmd.extend(["--load-file", use_model])
                run_settings = get_run_settings(cmd)
                run_settings.setup()
                eval_result = run_settings.eval_result
                cache_result.save(eval_result)
            store_num = eval_result[args.get_key]
            env_results[info['report_name']][method_name].append(store_num)
            rutils.pstart_sep()
            print(f"Result for {use_model}: {store_num}")
            rutils.pend_sep()

    print(generate_eval_table(env_results, scale_factor, rename_sections))