def compute_baselines(environment, agent_name, checkpoint, episodes, trials):
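    """Estimate a baseline score for a saved checkpoint.

    Builds an SB3 A2C agent via make_agent, loads the checkpoint weights,
    and evaluates it over the given number of episodes; this is repeated
    `trials` times and the mean episode reward is returned. Note: the device
    comes from the module-level `args`, not from a parameter of this function.
    """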
    directory = f"./runs/{agent_name}_checkpoints"
    pretraining = {
        "latest": f"{directory}/{checkpoint}/checkpoint.zip",
        "best": "./runs/{agent_name}/best/checkpoint.zip",
        "trained_steps": int(f"{checkpoint}"),
    }

    # Baseline
    baseline_scores = []
    for _ in range(trials):
        agent, steps = make_agent("SB3_ON",
                                  environment,
                                  directory,
                                  json.loads('{"ALGO": "A2C"}'),
                                  eval_freq=100000000,
                                  n_eval_episodes=1,
                                  pretraining=pretraining,
                                  device=args.device)
        agent.load_weights(f"{directory}/{checkpoint}/checkpoint.zip")
        # agent.set_weights(weights)
        evaluator = agent.evaluator()
        baseline = evaluate(evaluator, episodes, 100000000)
        print("Baseline: ", baseline["episode_rewards"])
        baseline_scores.append(baseline["episode_rewards"])
    return sum(baseline_scores) / len(baseline_scores)
def calc_result(offset, params, info, dir, device, key, num_steps,
                num_episodes):
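    """Evaluate the agent with its parameters shifted by `offset` along `dir`.

    Rebuilds the agent via make_agent, sets its weights to params + offset * dir
    (elementwise over the parameter list), runs `evaluate`, and returns the
    entry named by `key` from the evaluation results.
    """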
    agent, steps = make_agent(info['agent_name'],
                              info['env'],
                              dir,
                              info['hyperparameters'],
                              device=device)

    weights = [p + g * offset for p, g in zip(params, dir)]
    agent.set_weights(weights)

    evaluator = agent.evaluator()
    eval_results = evaluate(evaluator, num_episodes, num_steps)

    return eval_results[key]
def main():
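    """Evaluate a checkpoint at a series of offsets along a search direction.

    Loads run metadata from info.json, reads the checkpoint parameters, and
    picks a direction (a filter-normalized random one if a seed is set,
    otherwise one read from `dir`). It then evaluates the agent at offsets
    spaced by --max-magnitude / --length (plus a finer sweep inside the first
    segment) and writes one JSON result file per offset.
    """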
    parser = argparse.ArgumentParser(
        description='run a particular evaluation job')
    parser.add_argument('params', type=str)
    parser.add_argument('dir',
                        type=str,
                        help="direction to search for threshold")
    parser.add_argument('outputfile', type=str)
    parser.add_argument('--num-steps', type=int, default=bigint)
    parser.add_argument('--num-episodes', type=int, default=bigint)
    parser.add_argument('--device', type=str, default="cpu")
    parser.add_argument('--length', type=int, default=5)
    parser.add_argument('--max-magnitude', type=float, default=0.1)

    args = parser.parse_args()

    with open(PurePath(args.outputfile).parent.parent / "info.json") as file:
        info = json.load(file)
    checkpoint = os.path.basename(args.outputfile)

    agent, steps = make_agent(info['agent_name'],
                              info['env'],
                              PurePath(args.params).parent.parent,
                              info['hyperparameters'],
                              device=args.device)

    params = [
        v.cpu().detach().numpy() for v in torch.load(
            args.params, map_location=torch.device('cpu')).values()
    ]
    if info['random_dir_seed'] is not None:
        seed = info['random_dir_seed']
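        # Derive a per-output-file seed so each job samples a distinct
        # direction (note: Python's str hash can vary across interpreter
        # runs unless PYTHONHASHSEED is fixed).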
        np.random.seed(seed + hash(args.outputfile) % (1 << 30))
        direction = [filter_normalize(p) for p in params]
    else:
        direction = readz(args.dir)

    if info['scale_dir']:
        dir_sum = sum(np.sum(x) for x in direction)
        if dir_sum != 0:
            direction = [d / dir_sum for d in direction]

    # High resolution in first segment
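    # Nine extra points at offsets i * max_magnitude / (10 * length),
    # i.e. a finer sweep inside the first coarse segment of the loop below.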
    for i in range(1, 10):
        mm = args.max_magnitude
        weights = [
            p + d * i * mm / (10 * args.length)
            for p, d in zip(params, direction)
        ]
        agent.set_weights(weights)
        evaluator = agent.evaluator()
        eval_results = evaluate(evaluator, args.num_episodes, args.num_steps)
        eval_results['checkpoint'] = checkpoint
        out_fname = f"{args.outputfile},0.{i}.json"
        eval_results["offset"] = i * mm / (10 * args.length)

        with open(out_fname, 'w') as file:
            file.write(json.dumps(eval_results))

    for i in range(args.length):
        mm = args.max_magnitude
        weights = [
            p + d * i * mm / args.length for p, d in zip(params, direction)
        ]
        agent.set_weights(weights)
        evaluator = agent.evaluator()
        eval_results = evaluate(evaluator, args.num_episodes, args.num_steps)
        eval_results['checkpoint'] = checkpoint
        out_fname = f"{args.outputfile},{i}.json"
        eval_results["offset"] = i * mm / args.length

        with open(out_fname, 'w') as file:
            file.write(json.dumps(eval_results))
def compare_a2c_ppo(environment, agent_name, checkpoint, episodes, trials,
                    baseline_reward):
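    """Compare a short A2C and PPO training step against a fixed baseline.

    For each of `trials` runs, an SB3 agent is built from the checkpoint,
    given a brief agent.train(2048, ...) run at a very small learning rate,
    and evaluated. Returns the mean relative change of the PPO and A2C
    rewards versus baseline_reward. As in compute_baselines, the device is
    taken from the module-level `args`.
    """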
    directory = f"./runs/{agent_name}_checkpoints"
    pretraining = {
        "latest": f"{directory}/{checkpoint}/checkpoint.zip",
        "best": "./runs/{agent_name}/best/checkpoint.zip",
        "trained_steps": int(f"{checkpoint}"),
    }

    # One step A2C
    a2c_scores = []
    for _ in range(trials):
        agent, steps = make_agent(
            "SB3_ON",
            environment,
            directory,
            json.loads(
                '{"ALGO": "A2C", "learning_rate": 0.000001, "n_steps": 128}'),
            eval_freq=100000000,
            n_eval_episodes=1,
            pretraining=pretraining,
            device=args.device)
        agent.load_weights(f"{directory}/{checkpoint}/checkpoint.zip")
        evaluator = agent.evaluator()
        agent.train(2048,
                    f"./runs/vpg/{agent_name}/{checkpoint}",
                    save_freq=10000)
        a2c = evaluate(evaluator, episodes, 100000000)
        print("A2C: ", a2c["episode_rewards"])
        a2c_scores.append(a2c["episode_rewards"])

    # One step PPO
    ppo_scores = []
    for _ in range(trials):
        agent, steps = make_agent(
            "SB3_ON",
            environment,
            directory,
            json.loads(
                '{"ALGO": "PPO", "learning_rate": 0.000001, "n_steps": 128}'),
            eval_freq=100000000,
            n_eval_episodes=1,
            pretraining=pretraining,
            device=args.device)
        evaluator = agent.evaluator()
        agent.load_weights(f"{directory}/{checkpoint}/checkpoint.zip")
        agent.train(2048,
                    f"./runs/vpg/{agent_name}/{checkpoint}",
                    save_freq=10000)
        ppo = evaluate(evaluator, episodes, 100000000)
        print("PPO: ", ppo["episode_rewards"])
        ppo_scores.append(ppo["episode_rewards"])

    # Calculate statistics
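    # Relative change vs. the baseline; multiplying by sign(baseline_reward)
    # keeps an improvement positive even when the baseline reward is negative.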
    a2c_reward = sum(a2c_scores) / len(a2c_scores)
    ppo_reward = sum(ppo_scores) / len(ppo_scores)
    ppo_percent = math.copysign(1, baseline_reward) * (
        (ppo_reward / baseline_reward) - 1)
    a2c_percent = math.copysign(1, baseline_reward) * (
        (a2c_reward / baseline_reward) - 1)

    return ppo_percent, a2c_percent
def save_results(agent, info, out_dir, results, job_name):
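    """Run the requested analyses for one job and dump the results to JSON.

    Depending on flags in `info`, this estimates Hessian eigenvalues,
    estimates or computes policy gradients (saved as .npz files under
    results/<job_name>), computes exact Hessian max/min eigenvalues, and/or
    runs a plain evaluation, before writing results/<job_name>.json.
    """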
    if info['est_hesh']:
        print(f"estimating hesh with {info['num_steps']} steps")
        assert info['num_episodes'] > 100000000, \
            "hesh calculation only takes in steps, not episodes"
        results = agent.calculate_eigenvalues(info['num_steps'])

    if info.get('est_grad', False):
        print(f"estimating est grad with {info['num_steps']} steps")
        assert info['num_episodes'] > 100000000, \
            "calculation only takes in steps, not episodes"
        action_evalutor = agent.action_evalutor()
        loss, grad = calculate_policy_ests(action_evalutor, info['num_steps'])
        vec_folder = out_dir / f"results/{job_name}"
        os.makedirs(vec_folder, exist_ok=True)
        np.savez(vec_folder / "est_grad.npz", *grad)
        results['est_loss'] = loss

    if info['batch_grad']:
        print(
            f"computing rollout with {info['num_steps']} steps, {info['num_episodes']} episodes"
        )
        vec_folder = out_dir / f"results/{job_name}"
        os.makedirs(vec_folder, exist_ok=True)

        evaluator = agent.evaluator()
        action_evaluator = agent.action_evalutor()

        policy_grad, _ = compute_policy_gradient_batch(evaluator,
                                                       action_evaluator,
                                                       info["num_episodes"],
                                                       info["num_steps"])

        # Dumb fix, try to remove
        cpu_policy_grad = []
        for p in policy_grad:
            if isinstance(p, np.ndarray):
                cpu_policy_grad.append(p)
            else:
                cpu_policy_grad.append(p.cpu())
        np.savez(vec_folder / "grad.npz", *cpu_policy_grad)
        np.savez(vec_folder / "grad_mag.npz", *cpu_policy_grad)

    if info['calc_hesh'] or info['calc_grad']:
        print(
            f"computing rollout with {info['num_steps']} steps, {info['num_episodes']} episodes"
        )
        evaluator = agent.evaluator()
        action_evalutor = agent.action_evalutor()
        all_states, all_returns, all_actions = gather_policy_hess_data(
            evaluator,
            info['num_episodes'],
            info['num_steps'],
            action_evalutor.gamma,
            "UNUSED",
            gae_lambda=1.0)

        vec_folder = out_dir / f"results/{job_name}"
        os.makedirs(vec_folder, exist_ok=True)

    if info['calc_grad']:
        policy_grad, _ = compute_policy_gradient(action_evalutor, all_states,
                                                 all_returns, all_actions,
                                                 action_evalutor.device)
        cpu_policy_grad = []
        for p in policy_grad:
            if isinstance(p, np.ndarray):
                cpu_policy_grad.append(p)
            else:
                cpu_policy_grad.append(p.cpu())
        np.savez(vec_folder / "grad.npz", *cpu_policy_grad)
        np.savez(vec_folder / "grad_mag.npz", *cpu_policy_grad)

    if info['calc_hesh']:
        print("estimating hesh")
        maxeig, mineig, maxeigvec, mineigvec = calculate_true_hesh_eigenvalues(
            action_evalutor,
            all_states,
            all_returns,
            all_actions,
            tol=0.01,
            device=action_evalutor.device)
        results['mineig'] = mineig
        results['maxeig'] = maxeig
        results['ratio'] = mineig / max(-0.001 * mineig, maxeig)
        vec_folder = out_dir / f"results/{job_name}"
        np.savez(vec_folder / "maxeigvec.npz", *maxeigvec)
        np.savez(vec_folder / "mineigvec.npz", *mineigvec)

    if not info['calc_hesh'] and not info['est_hesh']:
        evaluator = agent.evaluator()
        eval_results = evaluate(evaluator, info['num_episodes'],
                                info['num_steps'])
        results.update(eval_results)

    print("dumping results")
    with open(out_dir / f"results/{job_name}.json", 'w') as file:
        json.dump(results, file)