Esempio n. 1
0
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    # config["eval_env"]["id"] = config["env"]["id"]

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Esempio n. 2
0
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=dict(id=env_id),
        eval_env_kwargs=dict(id=env_id),
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(51e3),
        eval_max_trajectories=50,
    )
    algo = SAC()  # Run with defaults.
    agent = SacAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e4,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(env_id=env_id)
    name = "sac_" + env_id
    log_dir = "example_2"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Esempio n. 3
0
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(cuda_idx=None,
                    master_cpus=cpus,
                    workers_cpus=list([x] for x in cpus),
                    set_affinity=True)
    sampler = CpuSampler(EnvCls=_make_env,
                         env_kwargs=dict(rank=0),
                         batch_T=1,
                         batch_B=4,
                         max_decorrelation_steps=0,
                         CollectorCls=CpuResetCollector)
    algo = SAC(batch_size=256,
               min_steps_learn=10000,
               replay_size=1000000,
               replay_ratio=256 / 4,
               target_update_interval=1,
               target_entropy=-9,
               target_update_tau=0.01,
               learning_rate=0.00025,
               action_prior="uniform",
               reward_scale=1,
               reparameterize=True,
               clip_grad_norm=1e9,
               n_step_return=1,
               updates_per_sync=1,
               bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir,
                        0,
                        name,
                        config,
                        use_summary_writer=True,
                        snapshot_mode='all'):
        runner.train()
Esempio n. 4
0
def build_and_train(env_id="Cassie-v0", run_ID=0, cuda_idx=None, snapshot_file=None):

    if snapshot_file is None:
        initial_optim_state_dict = None
        initial_model_state_dict = None
    else:
        snapshot = torch.load(snapshot_file)
        initial_optim_state_dict=snapshot['optimizer_state_dict']
        initial_model_state_dict=snapshot['agent_state_dict']

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=dict(id=env_id,
            xml_file=get_full_path('resources/cassie.xml')),
        eval_env_kwargs=dict(id=env_id,
            xml_file=get_full_path('resources/cassie.xml')),
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=1,
        eval_max_steps=int(1000),
        eval_max_trajectories=50, # 50
    )
    algo = SAC(
            initial_optim_state_dict=initial_optim_state_dict)
    agent = SacAgent(
            initial_model_state_dict=initial_model_state_dict)
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=5e4, #5e4
        affinity=dict(cuda_idx=cuda_idx),
    )
    other_param = dict(
            env_id=env_id,
            forward_reward_weight=0,
            shift_cost=True,
            cum_steps='1M')
    name = "sac_" + env_id
    log_dir = "Cassie_stand"
    with logger_context(log_dir, run_ID, name, other_param, snapshot_mode='last', use_summary_writer=True):
        runner.train()
Esempio n. 5
0
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    print('Variant', variant)
    config = update_config(config, variant)

    sampler = SerialSampler(EnvCls=DMControlEnv,
                            env_kwargs=config["env"],
                            CollectorCls=CpuResetCollector,
                            eval_env_kwargs=config["eval_env"],
                            **config["sampler"])
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
Esempio n. 6
0
def build_and_train():
    affinity = dict(cuda_idx=None, workers_cpus=list(range(15)))
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        batch_T=6000,
        batch_B=20,
    )
    algo = SAC(bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=600,
        affinity=affinity,
    )
    config = dict(env_id='reaching')
    name = "sac_reaching"
    log_dir = os.path.join(os.path.dirname(__file__), "example")
    with logger_context(log_dir, 0, name, config, use_summary_writer=True):
        runner.train()
Esempio n. 7
0
def build_and_train(env_id="HalfCheetah-v2",
                    run_ID=0,
                    cuda_idx=None,
                    sample_mode='cpu',
                    n_parallel=2,
                    eval=False,
                    wandb_log=True,
                    log_interval_steps=1e5,
                    n_steps=50e6):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if sample_mode == "serial":
        Sampler = SerialSampler  # (Ignores workers_cpus.)
        if eval:
            eval_collector_cl = SerialEvalCollector
        else:
            eval_collector_cl = None

        print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.")
    elif sample_mode == "cpu":
        Sampler = CpuSampler
        if eval:
            eval_collector_cl = CpuEvalCollector
        else:
            eval_collector_cl = None

        print(
            f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing."
        )

    num_envs = 8
    b_size = 20

    env_kwargs = dict(id=env_id)
    if eval:
        eval_env_kwargs = env_kwargs
        eval_max_steps = 1e4
        num_eval_envs = num_envs
    else:
        eval_env_kwargs = None
        eval_max_steps = None
        num_eval_envs = 0

    sampler = Sampler(
        EnvCls=gym_make,
        env_kwargs=env_kwargs,
        eval_env_kwargs=eval_env_kwargs,
        batch_T=b_size,  # One time-step per sampler iteration.
        batch_B=num_envs,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=num_eval_envs,
        eval_CollectorCls=eval_collector_cl,
        eval_max_steps=eval_max_steps,
    )
    #         eval_max_trajectories=50,
    #     )
    algo = SAC()  # Run with defaults.
    agent = SacAgent()

    if eval:
        RunnerCl = MinibatchRlEval
    else:
        RunnerCl = MinibatchRl

    runner = RunnerCl(algo=algo,
                      agent=agent,
                      sampler=sampler,
                      n_steps=n_steps,
                      log_interval_steps=log_interval_steps,
                      affinity=affinity,
                      wandb_log=True)
    config = dict(env_id=env_id)
    name = "sac_" + env_id
    log_dir = "baseline_run"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Esempio n. 8
0
def build_and_train():
    opt_affinities = list()
    opt_affinity = dict(cpus=[0],
                        cuda_idx=None,
                        torch_threads=1,
                        set_affinity=True)
    opt_affinities.append(opt_affinity)
    smp_affinity = AttrDict(
        all_cpus=[0, 1],
        master_cpus=[0],
        workers_cpus=[1],
        master_torch_threads=1,
        worker_torch_threads=1,
        cuda_idx=None,
        alternating=False,  # Just to pass through a check.
        set_affinity=True,
    )
    affinity = AttrDict(
        all_cpus=[0, 1],  # For exp launcher to use taskset.
        optimizer=opt_affinities,
        sampler=smp_affinity,
        set_affinity=True,
    )
    sampler = AsyncCpuSampler(EnvCls=_make_env,
                              env_kwargs=dict(rank=0),
                              batch_T=600,
                              batch_B=3,
                              max_decorrelation_steps=0,
                              CollectorCls=DbCpuResetCollector)
    algo = SAC(batch_size=256,
               min_steps_learn=10000,
               replay_size=1000000,
               replay_ratio=1,
               target_update_interval=1,
               target_entropy=-9,
               target_update_tau=0.01,
               learning_rate=0.00025,
               action_prior="uniform",
               reward_scale=1,
               reparameterize=True,
               clip_grad_norm=1e9,
               n_step_return=1,
               updates_per_sync=1,
               bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = AsyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir,
                        0,
                        name,
                        config,
                        use_summary_writer=False,
                        snapshot_mode='all'):
        runner.train()
Esempio n. 9
0
def build_and_train(game="cartpole",
                    run_ID=0,
                    cuda_idx=None,
                    sample_mode="serial",
                    n_parallel=2,
                    eval=False,
                    serial=False,
                    train_mask=[True, True],
                    wandb_log=False,
                    save_models_to_wandb=False,
                    log_interval_steps=1e5,
                    observation_mode="agent",
                    inc_player_last_act=False,
                    alt_train=False,
                    eval_perf=False,
                    n_steps=50e6,
                    one_agent=False):
    # def envs:
    if observation_mode == "agent":
        fully_obs = False
        rand_obs = False
    elif observation_mode == "random":
        fully_obs = False
        rand_obs = True
    elif observation_mode == "full":
        fully_obs = True
        rand_obs = False

    n_serial = None
    if game == "cartpole":
        work_env = gym.make
        env_name = 'CartPole-v1'
        cont_act = False
        state_space_low = np.asarray([
            0.0, 0.0, 0.0, 0.0, -4.8000002e+00, -3.4028235e+38, -4.1887903e-01,
            -3.4028235e+38
        ])
        state_space_high = np.asarray([
            1.0, 1.0, 1.0, 1.0, 4.8000002e+00, 3.4028235e+38, 4.1887903e-01,
            3.4028235e+38
        ])
        obs_space = Box(state_space_low, state_space_high, dtype=np.float32)
        player_act_space = work_env(env_name).action_space
        player_act_space.shape = (1, )
        print(player_act_space)
        if inc_player_last_act:
            observer_obs_space = Box(np.append(state_space_low, 0),
                                     np.append(state_space_high, 1),
                                     dtype=np.float32)
        else:
            observer_obs_space = obs_space
        player_reward_shaping = player_reward_shaping_cartpole
        observer_reward_shaping = observer_reward_shaping_cartpole
        max_decor_steps = 20
        b_size = 20
        num_envs = 8
        max_episode_length = np.inf
        player_model_kwargs = dict(hidden_sizes=[24],
                                   lstm_size=16,
                                   nonlinearity=torch.nn.ReLU,
                                   normalize_observation=False,
                                   norm_obs_clip=10,
                                   norm_obs_var_clip=1e-6)
        observer_model_kwargs = dict(hidden_sizes=[64],
                                     lstm_size=16,
                                     nonlinearity=torch.nn.ReLU,
                                     normalize_observation=False,
                                     norm_obs_clip=10,
                                     norm_obs_var_clip=1e-6)

    elif game == "hiv":
        work_env = wn.gym.make
        env_name = 'HIV-v0'
        cont_act = False
        state_space_low = np.asarray(
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        state_space_high = np.asarray([
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.inf, np.inf, np.inf, np.inf,
            np.inf, np.inf
        ])
        obs_space = Box(state_space_low, state_space_high, dtype=np.float32)
        player_act_space = work_env(env_name).action_space
        if inc_player_last_act:
            observer_obs_space = Box(np.append(state_space_low, 0),
                                     np.append(state_space_high, 3),
                                     dtype=np.float32)
        else:
            observer_obs_space = obs_space
        player_reward_shaping = player_reward_shaping_hiv
        observer_reward_shaping = observer_reward_shaping_hiv
        max_decor_steps = 10
        b_size = 32
        num_envs = 8
        max_episode_length = 100
        player_model_kwargs = dict(hidden_sizes=[32],
                                   lstm_size=16,
                                   nonlinearity=torch.nn.ReLU,
                                   normalize_observation=False,
                                   norm_obs_clip=10,
                                   norm_obs_var_clip=1e-6)
        observer_model_kwargs = dict(hidden_sizes=[64],
                                     lstm_size=16,
                                     nonlinearity=torch.nn.ReLU,
                                     normalize_observation=False,
                                     norm_obs_clip=10,
                                     norm_obs_var_clip=1e-6)

    elif game == "heparin":
        work_env = HeparinEnv
        env_name = 'Heparin-Simulator'
        cont_act = False
        state_space_low = np.asarray([
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 18728.926, 72.84662, 0.0, 0.0,
            0.0, 0.0, 0.0
        ])
        state_space_high = np.asarray([
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.7251439e+04, 1.0664291e+02,
            200.0, 8.9383472e+02, 1.0025734e+02, 1.5770737e+01, 4.7767456e+01
        ])
        # state_space_low = np.asarray([0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18728.926,72.84662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])
        # state_space_high = np.asarray([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.7251439e+04,1.0664291e+02,0.0000000e+00,8.9383472e+02,1.4476662e+02,1.3368750e+02,1.6815166e+02,1.0025734e+02,1.5770737e+01,4.7767456e+01,7.7194958e+00])
        obs_space = Box(state_space_low, state_space_high, dtype=np.float32)
        player_act_space = work_env(env_name).action_space
        if inc_player_last_act:
            observer_obs_space = Box(np.append(state_space_low, 0),
                                     np.append(state_space_high, 4),
                                     dtype=np.float32)
        else:
            observer_obs_space = obs_space
        player_reward_shaping = player_reward_shaping_hep
        observer_reward_shaping = observer_reward_shaping_hep
        max_decor_steps = 3
        b_size = 20
        num_envs = 8
        max_episode_length = 20
        player_model_kwargs = dict(hidden_sizes=[32],
                                   lstm_size=16,
                                   nonlinearity=torch.nn.ReLU,
                                   normalize_observation=False,
                                   norm_obs_clip=10,
                                   norm_obs_var_clip=1e-6)
        observer_model_kwargs = dict(hidden_sizes=[128],
                                     lstm_size=16,
                                     nonlinearity=torch.nn.ReLU,
                                     normalize_observation=False,
                                     norm_obs_clip=10,
                                     norm_obs_var_clip=1e-6)

    elif game == "halfcheetah":
        assert not serial
        assert not one_agent
        work_env = gym.make
        env_name = 'HalfCheetah-v2'
        cont_act = True
        temp_env = work_env(env_name)
        state_space_low = np.concatenate([
            np.zeros(temp_env.observation_space.low.shape),
            temp_env.observation_space.low
        ])
        state_space_high = np.concatenate([
            np.ones(temp_env.observation_space.high.shape),
            temp_env.observation_space.high
        ])
        obs_space = Box(state_space_low, state_space_high, dtype=np.float32)
        player_act_space = temp_env.action_space
        if inc_player_last_act:
            observer_obs_space = Box(np.append(state_space_low, 0),
                                     np.append(state_space_high, 4),
                                     dtype=np.float32)
        else:
            observer_obs_space = obs_space
        player_reward_shaping = None
        observer_reward_shaping = None
        temp_env.close()
        max_decor_steps = 0
        b_size = 20
        num_envs = 8
        max_episode_length = np.inf
        player_model_kwargs = dict(hidden_sizes=[256, 256])
        observer_model_kwargs = dict(hidden_sizes=[256, 256])
        player_q_model_kwargs = dict(hidden_sizes=[256, 256])
        observer_q_model_kwargs = dict(hidden_sizes=[256, 256])
        player_v_model_kwargs = dict(hidden_sizes=[256, 256])
        observer_v_model_kwargs = dict(hidden_sizes=[256, 256])
    if game == "halfcheetah":
        observer_act_space = Box(
            low=state_space_low[:int(len(state_space_low) / 2)],
            high=state_space_high[:int(len(state_space_high) / 2)])
    else:
        if serial:
            n_serial = int(len(state_space_high) / 2)
            observer_act_space = Discrete(2)
            observer_act_space.shape = (1, )
        else:
            if one_agent:
                observer_act_space = IntBox(
                    low=0,
                    high=player_act_space.n *
                    int(2**int(len(state_space_high) / 2)))
            else:
                observer_act_space = IntBox(low=0,
                                            high=int(2**int(
                                                len(state_space_high) / 2)))

    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if sample_mode == "serial":
        alt = False
        Sampler = SerialSampler  # (Ignores workers_cpus.)
        if eval:
            eval_collector_cl = SerialEvalCollector
        else:
            eval_collector_cl = None
        print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.")
    elif sample_mode == "cpu":
        alt = False
        Sampler = CpuSampler
        if eval:
            eval_collector_cl = CpuEvalCollector
        else:
            eval_collector_cl = None
        print(
            f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing."
        )
    env_kwargs = dict(work_env=work_env,
                      env_name=env_name,
                      obs_spaces=[obs_space, observer_obs_space],
                      action_spaces=[player_act_space, observer_act_space],
                      serial=serial,
                      player_reward_shaping=player_reward_shaping,
                      observer_reward_shaping=observer_reward_shaping,
                      fully_obs=fully_obs,
                      rand_obs=rand_obs,
                      inc_player_last_act=inc_player_last_act,
                      max_episode_length=max_episode_length,
                      cont_act=cont_act)
    if eval:
        eval_env_kwargs = env_kwargs
        eval_max_steps = 1e4
        num_eval_envs = num_envs
    else:
        eval_env_kwargs = None
        eval_max_steps = None
        num_eval_envs = 0
    sampler = Sampler(
        EnvCls=CWTO_EnvWrapper,
        env_kwargs=env_kwargs,
        batch_T=b_size,
        batch_B=num_envs,
        max_decorrelation_steps=max_decor_steps,
        eval_n_envs=num_eval_envs,
        eval_CollectorCls=eval_collector_cl,
        eval_env_kwargs=eval_env_kwargs,
        eval_max_steps=eval_max_steps,
    )
    if game == "halfcheetah":
        player_algo = SAC()
        observer_algo = SACBeta()
        player = SacAgent(ModelCls=PiMlpModel,
                          QModelCls=QofMuMlpModel,
                          model_kwargs=player_model_kwargs,
                          q_model_kwargs=player_q_model_kwargs,
                          v_model_kwargs=player_v_model_kwargs)
        observer = SacAgentBeta(ModelCls=PiMlpModelBeta,
                                QModelCls=QofMuMlpModel,
                                model_kwargs=observer_model_kwargs,
                                q_model_kwargs=observer_q_model_kwargs,
                                v_model_kwargs=observer_v_model_kwargs)
    else:
        player_model = CWTO_LstmModel
        observer_model = CWTO_LstmModel

        player_algo = PPO()
        observer_algo = PPO()
        player = CWTO_LstmAgent(ModelCls=player_model,
                                model_kwargs=player_model_kwargs,
                                initial_model_state_dict=None)
        observer = CWTO_LstmAgent(ModelCls=observer_model,
                                  model_kwargs=observer_model_kwargs,
                                  initial_model_state_dict=None)
    if one_agent:
        agent = CWTO_AgentWrapper(player,
                                  observer,
                                  serial=serial,
                                  n_serial=n_serial,
                                  alt=alt,
                                  train_mask=train_mask,
                                  one_agent=one_agent,
                                  nplayeract=player_act_space.n)
    else:
        agent = CWTO_AgentWrapper(player,
                                  observer,
                                  serial=serial,
                                  n_serial=n_serial,
                                  alt=alt,
                                  train_mask=train_mask)

    if eval:
        RunnerCl = MinibatchRlEval
    else:
        RunnerCl = MinibatchRl

    runner = RunnerCl(player_algo=player_algo,
                      observer_algo=observer_algo,
                      agent=agent,
                      sampler=sampler,
                      n_steps=n_steps,
                      log_interval_steps=log_interval_steps,
                      affinity=affinity,
                      wandb_log=wandb_log,
                      alt_train=alt_train)
    config = dict(domain=game)
    if game == "halfcheetah":
        name = "sac_" + game
    else:
        name = "ppo_" + game
    log_dir = os.getcwd() + "/cwto_logs/" + name
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
    if save_models_to_wandb:
        agent.save_models_to_wandb()
    if eval_perf:
        eval_n_envs = 10
        eval_envs = [CWTO_EnvWrapper(**env_kwargs) for _ in range(eval_n_envs)]
        set_envs_seeds(eval_envs, make_seed())
        eval_collector = SerialEvalCollector(envs=eval_envs,
                                             agent=agent,
                                             TrajInfoCls=TrajInfo_obs,
                                             max_T=1000,
                                             max_trajectories=10,
                                             log_full_obs=True)
        traj_infos_player, traj_infos_observer = eval_collector.collect_evaluation(
            runner.get_n_itr())
        observations = []
        player_actions = []
        returns = []
        observer_actions = []
        for traj in traj_infos_player:
            observations.append(np.array(traj.Observations))
            player_actions.append(np.array(traj.Actions))
            returns.append(traj.Return)
        for traj in traj_infos_observer:
            observer_actions.append(
                np.array([
                    obs_action_translator(act, eval_envs[0].power_vec,
                                          eval_envs[0].obs_size)
                    for act in traj.Actions
                ]))

        # save results:
        open_obs = open('eval_observations.pkl', "wb")
        pickle.dump(observations, open_obs)
        open_obs.close()
        open_ret = open('eval_returns.pkl', "wb")
        pickle.dump(returns, open_ret)
        open_ret.close()
        open_pact = open('eval_player_actions.pkl', "wb")
        pickle.dump(player_actions, open_pact)
        open_pact.close()
        open_oact = open('eval_observer_actions.pkl', "wb")
        pickle.dump(observer_actions, open_oact)
        open_oact.close()