def setup(self, argv):
    """Parse component flags from `argv` and cache them on the instance.

    Called right after the component name has been read; populates the
    scalar run flags and the nested env/sess/agent/eval configs, then
    cross-checks the configs for compatibility.
    """
    parsed = parser.parse_args(args=argv)
    self.args = parsed

    # Scalar run-identification / training flags, copied straight off argparse.
    self.experiment_id = parsed.experiment_id
    self.work_id = parsed.work_id
    self.experiment_name = parsed.experiment_name
    self.batch_size = parsed.batch_size
    self.traj_length = parsed.traj_length
    self.seed = parsed.seed
    self.results_folder = parsed.results_folder
    self.hyper_params = parsed.hyper_configs

    # Configs arrive flattened on the command line; re-nest them into dicts.
    self.env_config = ConfigDict(to_nested_dicts(parsed.env_config))
    self.sess_config = ConfigDict(to_nested_dicts(parsed.sess_config))
    self.agent_config = ConfigDict(to_nested_dicts(parsed.agent_config))

    # eval_config is optional on some component parsers; default to empty.
    if hasattr(parsed, 'eval_config'):
        self.eval_config = ConfigDict(to_nested_dicts(parsed.eval_config))
    else:
        self.eval_config = ConfigDict()

    check_config_compatibility(self.env_config, self.sess_config,
                               self.agent_config, self.eval_config)
def main(argv):
    """Build the test experiment, place it according to --mode, and launch it.

    Supported modes: 'localhost', 'ssh', 'slurm'. Blocks until interrupted,
    then tears the experiment down.
    """
    args = parser.parse_args(argv[1:])
    assert args.command == 'mode'

    cluster = Cluster.new('tmux')
    exp = cluster.new_experiment(EXP_NAME)
    exp.set_preamble_cmds(PREAMBLE_CMDS)
    serv, cli = create_program(exp)

    mode = args.mode
    if mode == 'localhost':
        localhost_placement(serv, cli, localhost_setup())
    elif mode == 'ssh':
        loader = NodeLoader(
            ConfigDict(argon.to_nested_dicts(args.cluster_config)),
            args.filter_regex)
        # This test expects exactly one remote node to place on.
        if len(loader.nodes) != 1:
            raise Exception(
                'For this test condition, please specify just a single ssh node.'
            )
        remote = loader.nodes[0]
        remote.setup(res_files=RES_FILES)
        ssh_placement(serv, cli, localhost_setup(), remote)
    elif mode == 'slurm':
        loader = NodeLoader(
            ConfigDict(argon.to_nested_dicts(args.cluster_config)),
            args.filter_regex)
        # This test expects exactly one remote node to place on.
        if len(loader.nodes) != 1:
            raise Exception(
                'For this test condition, please specify just a single slurm node.'
            )
        remote = loader.nodes[0]
        remote.setup(res_files=RES_FILES)
        slurm_placement(serv, cli, localhost_setup(), remote)
    else:
        raise Exception('Unknown mode %s' % mode)

    try:
        cluster.launch(exp)
        # Park the main thread; Ctrl-C triggers cleanup below.
        while True:
            time.sleep(100000)
    except KeyboardInterrupt:
        cluster.delete(experiment_name=EXP_NAME)
def make_env(k_val):
    """Instantiate a single environment configured for the given `k_val`.

    Starts from the command-line env config, applies fixed evaluation-time
    overrides, and dynamically imports and constructs the env class.
    """
    cfg = ConfigDict(to_nested_dicts(args.env_config))

    # Evaluation-time overrides of the CLI-provided config.
    cfg.lp_features = False
    cfg.k = k_val
    cfg.n_local_moves = args.n_local_moves
    cfg.primal_gap_reward = True
    cfg.delta_reward = False
    cfg.disable_maxcuts = args.disable_maxcuts

    # This path only supports a single-graph environment.
    assert cfg.n_graphs == 1

    env_cls = U.import_obj(cfg.class_name, cfg.class_path)
    return env_cls(id=0, seed=args.seed, **cfg)
def main(argv):
    """Evaluation entry point.

    Parses flags, restricts visible CUDA devices, re-nests the flattened
    configs, dynamically imports the shell/env/agent classes, then runs the
    Evaluator with results written under a mode-dependent directory.
    """
    global args
    args = parser.parse_args(argv[1:])

    # CUDA_VISIBLE_DEVICES expects a comma-separated list of device ids;
    # an empty string hides every GPU (CPU-only run).
    if args.gpu_ids:
        # BUG FIX: was '_'.join(...), which produced an invalid value such as
        # '0_1' for multi-GPU runs and silently disabled all GPUs.
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, args.gpu_ids))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    # Configs arrive flattened on the command line; re-nest them.
    sess_config = ConfigDict(to_nested_dicts(args.sess_config))
    env_config = ConfigDict(to_nested_dicts(args.env_config))
    agent_config = ConfigDict(to_nested_dicts(args.agent_config))

    # Dynamically import the classes named in the configs.
    shell_class = U.import_obj(sess_config.shell.class_name,
                               sess_config.shell.class_path)
    env_class = U.import_obj(env_config.class_name, env_config.class_path)
    agent_class = U.import_obj(agent_config.class_name,
                               agent_config.class_path)

    # Results directory is keyed by evaluation mode.
    if args.standalone:
        if args.heuristic:
            results_dir = Path(
                f'/data/nms/tfp/evaluation/standalone/{args.heuristic}/{args.name}/{env_config.graph_start_idx}/'
            )
        else:
            results_dir = Path(
                f'/data/nms/tfp/evaluation/standalone/agent/{args.name}/{env_config.graph_start_idx}/'
            )
    elif args.without_agent:
        results_dir = Path(
            f'/data/nms/tfp/evaluation/without_agent/{args.name}/{env_config.graph_start_idx}/'
        )
    elif args.heuristic:
        results_dir = Path(
            f'/data/nms/tfp/evaluation/{args.heuristic}/{args.name}/{env_config.graph_start_idx}/'
        )
    else:
        results_dir = Path(
            f'/data/nms/tfp/evaluation/scip/{args.name}/{env_config.graph_start_idx}'
        )
    results_dir.mkdir(parents=True, exist_ok=True)

    evaluator = Evaluator(
        shell_class=shell_class,
        shell_config=sess_config.shell,
        agent_class=agent_class,
        agent_config=agent_config,
        env_class=env_class,
        env_config=env_config,
        seed=args.seed,
        dataset=env_config.dataset,
        dataset_type=env_config.dataset_type,
        graph_start_idx=env_config.graph_start_idx,
        gap=args.gap,
        max_nodes=args.max_nodes,
        batch_size=args.batch_size,
        n_local_moves=args.n_local_moves,
        results_dir=results_dir,
        use_parallel_envs=args.use_parallel_envs,
        use_threaded_envs=args.use_threaded_envs,
        heur_frequency=args.heur_frequency,
        # A shell is only needed when an agent (not a heuristic) drives.
        create_shell=(args.heuristic is None),
        **sess_config)
    evaluator.run(standalone=args.standalone,
                  without_agent=args.without_agent,
                  heuristic=args.heuristic)
    print('Done!')
def train(argv):
    """Drive a hyperparameter sweep: each `params` combination becomes one
    work unit (its own experiment), which is then placed on the cluster and
    launched."""
    tp = TurrealParser()
    tp.add_external_parser(parser)
    func, external_parser_args = tp.main(argv[1:])
    # Only the 'create' action launches experiments; any other turreal
    # sub-command has already been handled by tp.main above.
    if func.__name__.split('action_')[-1] != 'create':
        return
    args = external_parser_args[0]
    validate_args(args)
    cluster = tp.get_cluster()
    # Specify experiment specific flags here.
    exp_flags = []
    hyper_configs = []
    exps = []
    # Cartesian product over the swept hyperparameters; `work_id` indexes the
    # resulting work units.
    for work_id, params in enumerate(
            hyper.product(
                hyper.discrete('env_config.k', [10, 25]),
                hyper.discrete('agent_config.lr_init', [1e-4, 2e-4]),
                # hyper.discrete('agent_config.lr_init', [1e-4, 2e-4, 3e-4]),
                # hyper.discrete('env_config.graph_start_idx', list(range(8))),
                hyper.discrete('agent_config.ent_dec_init', [2e-2]),
            )):
        exp = cluster.new_experiment('%s-%d' % (tp.experiment_name, work_id),
                                     env_name='liaison')
        # start tensorboard only for the first work unit.
        coloc_constraints = build_program(
            exp,
            args.n_actors,
            ConfigDict(argon.to_nested_dicts(args.resource_req_config)),
            bundle_actors=args.bundle_actors,
            irs_placement=IRS_NODE,
            visualizer_placement=VISUALIZER_NODE,
            with_visualizers=(work_id == 0) and (not args.without_visualizers),
            with_evaluators=(not args.without_evaluators),
            without_valid_and_test_evaluators=args.without_valid_and_test_evaluators,
            with_irs_proxy=args.use_irs_proxy,
            irs_proxy_placement=IRS_PROXY_NODE)
        # Per-work-unit command-line flags, including the sweep point itself.
        exp_flag = ['--work_id', str(work_id)]
        # NOTE(review): shlex.quote already returns str, so the outer str()
        # looks redundant — confirm before removing.
        exp_flag += ['--hyper_configs', str(shlex.quote(json.dumps(params)))]
        exp_flag += hyper.to_commandline(params)
        exps.append(exp)
        exp_flags.append(exp_flag)
        hyper_configs.append(params)
        # --disable_sweep: keep only the first sweep point.
        if args.disable_sweep:
            break
    # Flatten every process in every work unit (process groups plus loose
    # processes) purely for the stats printout below.
    exp_procs = [[
        proc for pg in exp.list_process_groups() for proc in pg.list_processes()
    ] + [proc for proc in exp.list_processes()] for exp in exps]
    print('-----------exp stats-------------')
    print('Number of work units: %d' % len(exps))
    print('Number of processes total: %d' % sum(map(len, exp_procs)))
    # Placement constraints arrive as 'a:b' / 'a;b' strings; split them here.
    # NOTE(review): `placer` is not used after construction — presumably the
    # LiaisonPlacer constructor performs the placement as a side effect;
    # confirm against its definition.
    placer = LiaisonPlacer(
        tp.exp_id,
        exps,
        ConfigDict(argon.to_nested_dicts(args.cluster_config)),
        args.filter_nodes_regex,
        args.whitelist_nodes,
        args.spy_measurement_interval,
        pl_constraints=list(map(lambda k: k.split(':'), args.pl_constraints)),
        coloc_constraints=list(
            map(lambda k: k.split(';'),
                coloc_constraints + args.coloc_constraints)),
        gpu_overload_obj_coeff=args.gpu_overload_obj_coeff,
        gpu_load_balancing_obj_coeff=args.gpu_load_balancing_obj_coeff,
        gpu_wu_consolidation_obj_coeff=args.gpu_wu_consolidation_obj_coeff,
        cpu_overload_obj_coeff=args.cpu_overload_obj_coeff,
        cpu_load_balancing_obj_coeff=args.cpu_load_balancing_obj_coeff,
        cpu_wu_consolidation_obj_coeff=args.cpu_wu_consolidation_obj_coeff,
        slurm_colocate_wunit=args.slurm_colocate_wunit,
        slurm_per_gpu_allocation=args.slurm_per_gpu_allocation,
    )
    tp.launch(exps, exp_flags, hyper_configs)