Example #1
0
def evaluate(net, save_domains=False, baseline=None):
    """Run the SysAdmin-v0 evaluation loop and collect reward statistics.

    Args:
        net: policy/value network; called as ``net(obs)`` and expected to
            return ``(action, value, pi, pi_full)``.
        save_domains: forwarded to the env as ``save_domain``.
        baseline: when truthy, actions come from ``random_action`` instead
            of the network.

    Returns:
        dict with per-step reward, per-problem reward, the list of finished
        episode rewards, and the number of finished problems.
    """
    env_fns = [
        lambda: gym.make('SysAdmin-v0', save_domain=save_domains)
        for _ in range(config.eval_batch)
    ]
    test_env = SubprocVecEnv(env_fns,
                             in_series=(config.eval_batch // config.cpus),
                             context='fork')
    tqdm_val = tqdm(desc='Validating',
                    total=config.eval_problems,
                    unit=' problems')

    with torch.no_grad():
        net.eval()

        total_reward = 0.
        finished = 0.
        episode_rewards = []
        step_count = 0

        obs = test_env.reset()

        while finished < config.eval_problems:
            step_count += 1

            # Either use the random baseline or query the network.
            if baseline:
                act = random_action(obs, baseline, config.multi)
            else:
                act, _v, _pi, _pi_full = net(obs)

            obs, rew, done, infos = test_env.step(act)

            total_reward += np.sum(rew)
            finished += np.sum(done)
            # Keep the episode totals of exactly the episodes that just ended.
            episode_rewards.extend(
                x['reward_total'] for x in itertools.compress(infos, done))

            tqdm_val.update(np.sum(done))

        # Averages: per environment step and per completed problem.
        r_avg_ps = total_reward / (step_count * config.eval_batch)
        r_avg_pp = total_reward / finished

        net.train()

    if args.print_raw:
        rew_mean = np.mean(episode_rewards)
        rew_ci95 = 1.96 * scipy.stats.sem(episode_rewards)
        print(f"{rew_mean:.2f} ± {rew_ci95:.2f}")

    tqdm_val.close()
    test_env.close()

    return {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'rewards': episode_rewards,
        'problems_finished': finished,
    }
Example #2
0
def evaluate(net, split='valid', subset=None):
    """Run the Sokograph-v0 evaluation loop.

    Args:
        net: network; called as ``net(obs)`` and expected to return
            ``(a, n, v, pi)``, converted to env actions by ``to_action``.
        split: dataset split forwarded to the env.
        subset: optional subset forwarded to the env.

    Returns:
        Tuple of (avg reward per step, solved per step, solved per finished
        problem, number of finished problems).
    """
    test_env = SubprocVecEnv(
        [lambda: gym.make('Sokograph-v0', split=split, subset=subset)
         for _ in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus),
        context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' steps')

    with torch.no_grad():
        net.eval()

        total_reward = 0.
        solved = 0
        finished = 0
        step_count = 0

        obs = test_env.reset()

        while finished < config.eval_problems:
            step_count += 1

            act, node, _v, _pi = net(obs)
            actions = to_action(act, node, obs, size=config.soko_size)

            obs, rew, done, infos = test_env.step(actions)

            total_reward += np.sum(rew)
            # An episode counts as solved when the env reports the flag as True
            # (`== True` kept deliberately: the key may be absent or non-bool).
            solved += sum(x.get('all_boxes_on_target', False) == True
                          for x in infos)
            finished += np.sum(done)

            tqdm_val.update()

        r_avg = total_reward / (step_count * config.eval_batch)  # reward per step
        solved_ps = solved / (step_count * config.eval_batch)
        solved_avg = solved / finished

        net.train()

    tqdm_val.close()
    test_env.close()

    return r_avg, solved_ps, solved_avg, finished
Example #3
0
def evaluate(net, planner):
    """Run the Boxworld-v0 evaluation loop and collect metrics.

    Args:
        net: policy/value network; called as ``net(s)`` and expected to
            return ``(a, v, pi)``.
        planner: forwarded to the env as ``plan``; when not None, episode
            infos are expected to carry 'path_len'/'steps'/'d_true'/'done'
            used for the optimality statistics.

    Returns:
        dict of evaluation metrics: rewards per step/problem, solve rates,
        steps per problem, and optimality means with standard errors
        (nan when planner is None).
    """
    test_env = SubprocVecEnv([
        lambda: gym.make('Boxworld-v0', plan=planner)
        for i in range(config.eval_batch)
    ],
                             in_series=(config.eval_batch // config.cpus),
                             context='fork')
    tqdm_val = tqdm(desc='Validating',
                    total=config.eval_problems,
                    unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0.
        problems_finished = 0.
        problems_timeout = 0.
        steps = 0

        opt_all = []  # optimality ratio of every finished episode (0. if unsolved)
        opt_solved = []  # optimality ratio of solved episodes only

        s = test_env.reset()

        while problems_finished < config.eval_problems:
            steps += 1
            a, v, pi = net(s)
            s, r, d, i = test_env.step(a)

            r_tot += np.sum(r)
            problems_solved += np.array(
                sum(x['d_true'] for x in i)
            )  # conversion to numpy for easier ZeroDivision handling (-> nan)
            problems_finished += np.sum(d)

            if planner is not None:
                # path_len / steps is 1.0 for an optimal solution, 0. for failures.
                opt_all += [
                    x['path_len'] / x['steps'] if x['d_true'] else 0.
                    for x in i if x['done']
                ]
                opt_solved += [
                    x['path_len'] / x['steps'] for x in i if x['d_true']
                ]

            tqdm_val.update(np.sum(d))

        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished

        r_avg_ps = r_tot / (steps * config.eval_batch
                            )  # average reward per step
        r_avg_pp = r_tot / problems_finished  # average reward per problem

        # Guard empty / too-short samples: np.mean([]) and scipy.stats.sem on
        # fewer than 2 values emit RuntimeWarnings while yielding nan anyway
        # (always the case when planner is None). Produce the same nan values
        # explicitly, without the warnings.
        opt_all_avg = np.mean(opt_all) if opt_all else float('nan')
        opt_all_sem = (scipy.stats.sem(opt_all)
                       if len(opt_all) > 1 else float('nan'))

        opt_solved_avg = np.mean(opt_solved) if opt_solved else float('nan')
        opt_solved_sem = (scipy.stats.sem(opt_solved)
                          if len(opt_solved) > 1 else float('nan'))

        avg_steps_to_solve = (steps * config.eval_batch) / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'problems_solved': problems_solved_avg,
        'problems_finished': problems_finished,
        'solved_per_step': problems_solved_ps,
        'steps_per_problem': avg_steps_to_solve,
        'optimality_all': opt_all_avg,
        'optimality_all_sem': opt_all_sem,
        'optimality_solved': opt_solved_avg,
        'optimality_solved_sem': opt_solved_sem,
    }

    return eval_log
Example #4
0
            # NOTE(review): this is the tail of a training loop whose header is
            # outside this view; code left byte-identical, comments only.
            # Periodic evaluation pass.
            eval_log = evaluate(net, planner)
            # debug_net(net)

            # Training statistics for this logging interval.
            log = {
                'env_steps': tot_env_steps,
                'rate': tqdm_main.format_dict['rate'],
                'loss': loss,
                'loss_pi': loss_pi,
                'loss_v': loss_v,
                'loss_h': loss_h,
                'entropy estimate': entropy,
                'gradient norm': norm,
                'value': v.mean(),
                'lr': net.lr,
                'alpha_h': net.alpha_h,
            }

            print(log, eval_log)
            # commit=False merges both dicts into a single wandb step.
            wandb.log(log, commit=False)
            wandb.log(eval_log)

            # save model to wandb
            net.save(os.path.join(wandb.run.dir, "model.pt"))

        # finish if max_epochs exceeded
        if config.max_epochs and (step // config.log_rate >=
                                  config.max_epochs):
            break

    env.close()
    tqdm_main.close()