def evaluate(net, save_domains=False, baseline=None):
    test_env = SubprocVecEnv(
        [lambda: gym.make('SysAdmin-v0', save_domain=save_domains) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_finished = 0.
        rewards = []
        steps = 0

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1
            if not baseline:
                a, v, pi, pi_full = net(s)
            else:
                a = random_action(s, baseline, config.multi)
            s, r, d, i = test_env.step(a)

            r_tot += np.sum(r)
            problems_finished += np.sum(d)
            rewards += [x['reward_total'] for x in itertools.compress(i, d)]
            tqdm_val.update(np.sum(d))

        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished            # average reward per problem

        net.train()

    if args.print_raw:
        rew_mean = np.mean(rewards)
        rew_ci95 = 1.96 * scipy.stats.sem(rewards)
        print(f"{rew_mean:.2f} ± {rew_ci95:.2f}")

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'rewards': rewards,
        'problems_finished': problems_finished,
    }

    return eval_log
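# The `random_action` baseline called above is defined elsewhere in the repo.
# Below is only a hypothetical sketch of such a helper, shown for illustration;
# the `n_actions` observation field and the per-entity binary action in the
# `multi` branch are assumed names/semantics, not the actual API.
def _random_action_sketch(states, baseline='random', multi=False):
    assert baseline == 'random', f"unsupported baseline: {baseline}"
    if multi:
        # factored action space: one binary decision per entity in each state
        return [np.random.randint(2, size=s['n_actions']) for s in states]
    # single action index per environment, sampled uniformly
    return [np.random.randint(s['n_actions']) for s in states]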
def evaluate(net, split='valid', subset=None):
    test_env = SubprocVecEnv(
        [lambda: gym.make('Sokograph-v0', split=split, subset=subset) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' steps')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0
        problems_finished = 0
        steps = 0

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1
            a, n, v, pi = net(s)
            actions = to_action(a, n, s, size=config.soko_size)
            s, r, d, i = test_env.step(actions)

            r_tot += np.sum(r)
            problems_solved += sum(x.get('all_boxes_on_target', False) for x in i)
            problems_finished += np.sum(d)
            tqdm_val.update()

        r_avg = r_tot / (steps * config.eval_batch)  # average reward per step
        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    return r_avg, problems_solved_ps, problems_solved_avg, problems_finished
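# Hedged usage sketch of the Sokoban evaluator above: the unpacking order
# matches its return statement, and the 'test' split name is an assumption
# about how the dataset splits are labelled.
def _example_sokoban_eval(net):
    r_avg, solved_ps, solved_ratio, finished = evaluate(net, split='test')
    print(f"reward/step {r_avg:.3f}, solved {100 * solved_ratio:.1f}% "
          f"of {int(finished)} problems")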
def evaluate(net, planner):
    test_env = SubprocVecEnv(
        [lambda: gym.make('Boxworld-v0', plan=planner) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0.
        problems_finished = 0.
        problems_timeout = 0.
        steps = 0
        opt_all = []
        opt_solved = []

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1
            a, v, pi = net(s)
            s, r, d, i = test_env.step(a)

            r_tot += np.sum(r)
            # conversion to numpy for easier ZeroDivision handling (-> nan)
            problems_solved += np.array(sum(x['d_true'] for x in i))
            problems_finished += np.sum(d)

            if planner is not None:
                # optimality = planner path length / agent steps (0. for unsolved problems)
                opt_all += [x['path_len'] / x['steps'] if x['d_true'] else 0. for x in i if x['done']]
                opt_solved += [x['path_len'] / x['steps'] for x in i if x['d_true']]

            tqdm_val.update(np.sum(d))

        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished
        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished            # average reward per problem
        opt_all_avg = np.mean(opt_all)
        opt_all_sem = scipy.stats.sem(opt_all)
        opt_solved_avg = np.mean(opt_solved)
        opt_solved_sem = scipy.stats.sem(opt_solved)
        avg_steps_to_solve = (steps * config.eval_batch) / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'problems_solved': problems_solved_avg,
        'problems_finished': problems_finished,
        'solved_per_step': problems_solved_ps,
        'steps_per_problem': avg_steps_to_solve,
        'optimality_all': opt_all_avg,
        'optimality_all_sem': opt_all_sem,
        'optimality_solved': opt_solved_avg,
        'optimality_solved_sem': opt_solved_sem,
    }

    return eval_log
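# Hedged usage sketch of the Boxworld evaluator above: with a planner the log
# carries optimality ratios (planner path length / agent steps); without one
# those entries are NaN. The 1.96 * SEM band mirrors the 95% interval printed
# by the SysAdmin evaluator.
def _example_boxworld_eval(net, planner):
    eval_log = evaluate(net, planner)
    print(f"solved {100 * eval_log['problems_solved']:.1f}%, "
          f"optimality (solved) {eval_log['optimality_solved']:.2f} "
          f"± {1.96 * eval_log['optimality_solved_sem']:.2f}")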
    # Periodic evaluation / logging inside the main training loop
    # (the loop header is not shown in this section).
    eval_log = evaluate(net, planner)
    # debug_net(net)

    log = {
        'env_steps': tot_env_steps,
        'rate': tqdm_main.format_dict['rate'],
        'loss': loss,
        'loss_pi': loss_pi,
        'loss_v': loss_v,
        'loss_h': loss_h,
        'entropy estimate': entropy,
        'gradient norm': norm,
        'value': v.mean(),
        'lr': net.lr,
        'alpha_h': net.alpha_h,
    }
    print(log, eval_log)
    wandb.log(log, commit=False)
    wandb.log(eval_log)

    # save model to wandb
    net.save(os.path.join(wandb.run.dir, "model.pt"))

    # finish if max_epochs exceeded
    if config.max_epochs and (step // config.log_rate >= config.max_epochs):
        break

env.close()
tqdm_main.close()
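# Hedged sketch of the surrounding control flow, which is not shown in this
# section: the `step // config.log_rate` termination check suggests the block
# above runs once every `config.log_rate` training steps. The function and the
# `optimizer_step` callable below are illustrative assumptions only.
def _training_loop_sketch(net, planner, optimizer_step):
    for step in itertools.count():
        optimizer_step()  # rollout collection, loss, backward pass, update (assumed callable)
        if step % config.log_rate == 0:
            eval_log = evaluate(net, planner)                   # periodic validation
            wandb.log(eval_log)                                 # plus the training `log` dict above
            net.save(os.path.join(wandb.run.dir, "model.pt"))   # checkpoint to wandb
            if config.max_epochs and (step // config.log_rate >= config.max_epochs):
                break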