Example no. 1
def test(flags, num_episodes: int = 10):
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, flags.xpid, "model.tar")))

    gym_env = create_env(flags)
    env = environment.Environment(gym_env)
    model = Net(gym_env.observation_space.shape, gym_env.action_space.n,
                flags.use_lstm)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []

    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    logging.info("Average returns over %i steps: %.1f", num_episodes,
                 sum(returns) / len(returns))
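For context, a minimal way to invoke this function might look like the sketch below; the flag names are an assumption covering only the attributes test() itself reads (create_env will typically need more, e.g. the environment name).

import argparse

# Hypothetical flag setup for calling test(); not part of the original code.
parser = argparse.ArgumentParser()
parser.add_argument("--xpid", default=None, help="experiment id; None falls back to ./latest/model.tar")
parser.add_argument("--savedir", default="~/logs", help="directory containing <xpid>/model.tar")
parser.add_argument("--mode", default="test", choices=["test", "test_render"])
parser.add_argument("--use_lstm", action="store_true")
flags = parser.parse_args()

test(flags, num_episodes=5)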
Example no. 2
def test_1(flags, model, num_episodes: int = 10):
    gym_env = create_env(flags)
    env = environment.Environment(gym_env)

    observation = env.initial()
    returns = []
    hidden_state = model.initial_state(batch_size=1)

    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs, new_hidden_state = model(observation, hidden_state)
        hidden_state = new_hidden_state
        policy_outputs = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
            hidden_state = model.initial_state(batch_size=1)
    env.close()
    return np.mean(returns), np.std(returns)
def act(flags, actor_index: int, free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue, model: torch.nn.Module, buffers: Buffers,
        initial_agent_state_buffers, level_name):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        # changed: create_env now also receives level_name and seed
        gym_env = create_env(flags, level_name, seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                timings.time("model")

                env_output = env.step(agent_output["action"])

                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
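The Buffers type and the queue handshake used by act() are not shown in these excerpts. Under the usual monobeast convention (an assumption here), buffers maps each rollout key to a list of shared-memory tensors, one per slot, shaped (unroll_length + 1, ...); an actor pops a free slot index from free_queue, fills rows 0..unroll_length, and pushes the index to full_queue for the learner. A sketch of such an allocation:

import typing

import torch

Buffers = typing.Dict[str, typing.List[torch.Tensor]]

def create_buffers(unroll_length: int, obs_shape, num_actions: int, num_buffers: int) -> Buffers:
    # Sketch only: the exact set of keys depends on the environment and model outputs.
    T = unroll_length
    specs = dict(
        frame=dict(size=(T + 1, *obs_shape), dtype=torch.uint8),
        reward=dict(size=(T + 1,), dtype=torch.float32),
        done=dict(size=(T + 1,), dtype=torch.bool),
        episode_return=dict(size=(T + 1,), dtype=torch.float32),
        episode_step=dict(size=(T + 1,), dtype=torch.int32),
        policy_logits=dict(size=(T + 1, num_actions), dtype=torch.float32),
        baseline=dict(size=(T + 1,), dtype=torch.float32),
        last_action=dict(size=(T + 1,), dtype=torch.int64),
        action=dict(size=(T + 1,), dtype=torch.int64),
    )
    buffers: Buffers = {key: [] for key in specs}
    for _ in range(num_buffers):
        for key in buffers:
            # Shared-memory tensors so actor processes can write into them directly.
            buffers[key].append(torch.empty(**specs[key]).share_memory_())
    return buffers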
def test(flags, num_episodes: int = 10):
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar"))
        )

    gym_env = create_env(flags)
    env = environment.Environment(gym_env)
    model = Net(num_actions=gym_env.action_space.n)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []

    video_frames = []
    attention_frames = []

    hidden_state = model.initial_state(batch_size=1)

    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs, new_hidden_state = model(observation, hidden_state)
        hidden_state = new_hidden_state
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
            hidden_state = model.initial_state(batch_size=1)

    if flags.mode == "write_videos":
        # Save numpy arrays, so we can make videos somewhere else.
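        # NOTE: in this excerpt video_frames and attention_frames are never appended to,
        # and videopath/attentionpath are assumed to be defined elsewhere.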
        video_frames = np.asarray(video_frames)
        with open(videopath, "wb") as f:
            np.save(f, video_frames)
        attention_frames = np.asarray(attention_frames)
        with open(attentionpath, "wb") as f:
            np.save(f, attention_frames)

    env.close()
    logging.info(
        "Average returns over %i episodes: %.1f", num_episodes, sum(returns) / len(returns)
    )
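If the arrays saved above are later turned into an actual video, one possibility (a sketch; imageio and the file names are assumptions, not part of the excerpt) is:

import numpy as np
import imageio  # requires imageio (and imageio-ffmpeg for .mp4 output)

frames = np.load("video_frames.npy")  # expected shape: (num_frames, H, W, 3), dtype uint8
imageio.mimsave("episode.mp4", list(frames), fps=30)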
Example no. 5
def test(flags, game_params, num_episodes: int = 10):
    if flags.xpid is None:
        raise Exception(
            "Specify an experiment id with --xpid. The `latest` option is not supported."
        )
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, flags.xpid, "model.tar")))

    replay_dict = dict(save_replay_episodes=1,
                       replay_dir='Replays/',
                       replay_prefix=flags.map_name)
    sc_env = init_game(game_params['env'], flags.map_name, **replay_dict)
    model = IMPALA_AC(
        env=sc_env, device='cpu',
        **game_params['HPs'])  # let's use cpu as default for test
    obs_processer = IMPALA_ObsProcesser(action_table=model.action_table,
                                        **game_params['obs_processer'])
    env = environment.Environment(sc_env, obs_processer)
    model.eval()  # disable dropout
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()  # env.reset
    returns = []

    while len(returns) < num_episodes:
        with torch.no_grad():
            agent_outputs = model.actor_step(observation)
        observation = env.step(agent_outputs["sc_env_action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    returns = np.array(returns)
    logging.info("Average returns over %i episodes: %.2f (std %.2f) ",
                 num_episodes, returns.mean(), returns.std())
    print("Saving to file")
    np.save('%s/%s/test_results' % (flags.savedir, flags.xpid), returns)
Example no. 6
def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params['env'],
                           flags.map_name,
                           random_seed=seed)
        obs_processer = IMPALA_ObsProcesser(action_table=model.action_table,
                                            **game_params['obs_processer'])
        env = environment.Environment(sc_env, obs_processer, seed)
        # initial rollout starts here
        env_output = env.initial()
        with torch.no_grad():
            agent_output = model.actor_step(env_output)

        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key != 'sc_env_action':  # no need to store this key in the buffers
                    buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])

                timings.time("step")

                with torch.no_grad():
                    agent_output = model.actor_step(env_output)

                timings.time("model")

                #env_output = env.step(agent_output["sc_env_action"])

                #timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    if key != 'sc_env_action':  # no need to store this key in the buffers
                        buffers[key][index][t + 1, ...] = agent_output[key]
                # env_output will be like
                # s_{0}, ..., s_{T}
                # act_mask_{0}, ..., act_mask_{T}
                # discount_{0}, ..., discount_{T}
                # r_{-1}, ..., r_{T-1}
                # agent_output will be like
                # a_0, ..., a_T with a_t ~ pi(.|s_t)
                # log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
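As a reading aid for the indexing comments inside the rollout loop above, a learner built on this layout re-aligns the one-step-delayed rewards before computing V-trace. A sketch with hypothetical key names, operating on batch tensors of shape (T + 1, batch_size, ...):

def align_rollout(batch):
    # Hypothetical key names; only the time alignment matters. Each entry of batch is a
    # tensor filled exactly as described in the comments above: rewards are stored one
    # step behind the action that produced them.
    rewards = batch["reward"][1:]               # r_0, ..., r_{T-1}
    behaviour_log_pi = batch["log_prob"][:-1]   # log pi_b(a_t | s_t) for t = 0..T-1
    discounts = batch["discount"][1:]           # discount applied after taking a_t
    bootstrap_value = batch["baseline"][-1]     # value estimate at s_T for bootstrapping
    return rewards, behaviour_log_pi, discounts, bootstrap_value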
Example no. 7
def test(flags):
    if flags.xpid is None:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, "latest", "model.tar")))
    elif ".tar" in flags.xpid:
        checkpointpath = os.path.expandvars(os.path.expanduser(flags.xpid))
    else:
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, flags.xpid, "model.tar")))

    if len(flags.env.split(",")) != 1:
        raise Exception("Only one environment allowed for testing")

    # load the original arguments for the loaded network
    flags_orig = file_writer.read_metadata(
        re.sub(r"model.*tar", "meta.json",
               checkpointpath).replace("/intermediate", ""))
    args_orig = flags_orig["args"]
    agent_type = args_orig.get("agent_type", "resnet")
    num_actions = args_orig.get("num_actions", 6)
    num_tasks = args_orig.get("num_tasks", 1)
    use_lstm = args_orig.get("use_lstm", False)
    use_popart = args_orig.get("use_popart", False)
    reward_clipping = args_orig.get("reward_clipping", "abs_one")
    frame_width = args_orig.get("frame_width", 84)
    frame_height = args_orig.get("frame_height", 84)
    aaa_input_format = args_orig.get("aaa_input_format", "gray_stack")

    # set the right agent class
    if agent_type.lower() in [
            "aaa", "attention_augmented", "attention_augmented_agent"
    ]:
        Net = AttentionAugmentedAgent
        logging.info("Using the Attention-Augmented Agent architecture.")
        agent_type = "aaa"
    elif agent_type.lower() in ["rn", "res", "resnet", "res_net"]:
        Net = ResNet
        logging.info("Using the ResNet architecture (monobeast version).")
        agent_type = "resnet"
    else:
        Net = AtariNet
        logging.warning(
            "No valid agent type specified. Using the default agent.")
        agent_type = "default"

    # check if the full action space should be used
    full_action_space = False
    if flags.num_actions == 18:
        full_action_space = True

    # create the environment
    gym_env = create_env(flags.env,
                         frame_height=frame_height,
                         frame_width=frame_width,
                         gray_scale=(agent_type != "aaa"
                                     or aaa_input_format == "gray_stack"),
                         full_action_space=full_action_space)
    env = environment.Environment(gym_env)

    # create the model and load its parameters
    model = Net(observation_shape=gym_env.observation_space.shape,
                num_actions=num_actions,
                num_tasks=num_tasks,
                use_lstm=use_lstm,
                use_popart=use_popart,
                reward_clipping=reward_clipping,
                rgb_last=(agent_type == "aaa"
                          and aaa_input_format == "rgb_last"))
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    if 'baseline.mu' not in checkpoint["model_state_dict"]:
        checkpoint["model_state_dict"]["baseline.mu"] = torch.zeros(1)
        checkpoint["model_state_dict"]["baseline.sigma"] = torch.ones(1)
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []
    while len(returns) < flags.num_episodes:
        if flags.mode == "test_render":
            time.sleep(0.05)
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs, _ = agent_outputs
        observation = env.step(policy_outputs["action"])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.1f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
    env.close()
    logging.info("Average returns over %i steps: %.1f", flags.num_episodes,
                 sum(returns) / len(returns))
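The metadata format read above is not shown in the excerpt; judging from the .get() calls, flags_orig is expected to look roughly like the following (example values only, an assumption):

flags_orig = {
    "args": {
        "agent_type": "resnet",            # or "aaa" / "attention_augmented_agent"
        "num_actions": 6,
        "num_tasks": 1,
        "use_lstm": False,
        "use_popart": False,
        "reward_clipping": "abs_one",
        "frame_width": 84,
        "frame_height": 84,
        "aaa_input_format": "gray_stack",  # or "rgb_last" for the AAA architecture
    }
}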
Example no. 8
def act(
    flags,
    env: str,
    task: int,
    full_action_space: bool,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        # create the environment from command line parameters
        # => could also create a special one which operates on a list of games (which we need)
        gym_env = create_env(
            env,
            frame_height=flags.frame_height,
            frame_width=flags.frame_width,
            gray_scale=(flags.aaa_input_format == "gray_stack"),
            full_action_space=full_action_space,
            task=task)

        # generate a seed for the environment (NO HUMAN STARTS HERE!), could just
        # use this for all games wrapped by the environment for our application
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)

        # wrap the environment, this is actually probably the point where we could
        # use multiple games, because the other environment is still one from Gym
        env = environment.Environment(gym_env)

        # get the initial frame, reward, done, return, step, last_action
        env_output = env.initial()

        # perform the first step
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            # get the index of a free rollout buffer from the queue
            index = free_queue.get()
            # a None index is the termination signal for breaking out of this loop
            if index is None:
                break

            # Write old rollout end.
            # the keys here are (frame, reward, done, episode_return, episode_step, last_action)
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            # here the keys are (policy_logits, baseline, action)
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # agent_state is the RNN/LSTM state at the start of this rollout; it is stored so the
            # learner can replay the model's forward pass from the first step of the unroll (hence
            # the dedicated initial_agent_state_buffers).
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout
            for t in range(flags.unroll_length):
                timings.reset()

                # forward pass without keeping track of gradients to get the agent action
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                timings.time("model")

                # agent acting in the environment
                env_output = env.step(agent_output["action"])

                timings.time("step")

                # writing the respective outputs of the current step (see above for the list of keys)
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")

            # after finishing a trajectory put the index in the "full queue",
            # presumably so that the data can be processed/sent to the learner
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
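The learner side that consumes full_queue is not part of this excerpt; under the standard monobeast layout (an assumption here) it looks roughly like this:

import threading

import torch

def get_batch(flags, free_queue, full_queue, buffers, initial_agent_state_buffers,
              lock=threading.Lock()):
    # Pop batch_size filled slots, stack them along a new batch dimension, and hand the
    # slot indices back to the actors so they can be reused.
    with lock:
        indices = [full_queue.get() for _ in range(flags.batch_size)]
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    initial_agent_state = tuple(
        torch.cat(ts, dim=1)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    for m in indices:
        free_queue.put(m)
    return batch, initial_agent_state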
Example no. 9
def test(flags):
    num_episodes = flags.episodes
    if flags.xpid is None:
        checkpointpath = "./latest/model.tar"
    else:
        log_path = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, flags.xpid, flags.env)))
        if not os.path.exists(log_path):
            os.mkdir(log_path)
        checkpointpath = os.path.expandvars(
            os.path.expanduser("%s/%s/%s" %
                               (flags.savedir, flags.xpid, "model.tar")))

    gym_env = create_gymenv(flags)
    if flags.agent in ["CNN", "SNLM", "MHA"]:
        env = environment.Environment(gym_env, "image")
    elif flags.agent in ["NLM", "KBMLP", "GCN"]:
        env = environment.Environment(gym_env, "absVKB")
    model = create_model(flags, gym_env)
    model.eval()
    checkpoint = torch.load(checkpointpath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])

    observation = env.initial()
    returns = []

    total_steps = 0
    obs_index = 0
    if flags.store_stats:
        stats = dict(episode=[],
                     total_steps=[],
                     reward=[],
                     action=[],
                     obs_index=[])
        evals = [[] for _ in range(3)]
        obs = []
    while len(returns) < num_episodes:
        if flags.mode == "test_render":
            env.gym_env.render()
        agent_outputs = model(observation)
        policy_outputs = agent_outputs
        observation = env.step(policy_outputs["action"])

        if flags.store_stats:
            frame = observation["frame"].numpy()[0, 0]
            if not obs or np.any(obs[-1] != frame):
                if "evaluation" in policy_outputs:
                    evaluation = policy_outputs["evaluation"]
                    for i, eval in enumerate(evals):
                        eval.append(evaluation[i].detach().numpy()[0])
                obs.append(frame)
            else:
                obs_index -= 1
            stats["episode"].append(len(returns))
            stats["total_steps"].append(total_steps)
            stats["obs_index"].append(obs_index)
            stats["reward"].append(observation["reward"].numpy()[0, 0])
            stats["action"].append(policy_outputs["action"].numpy()[0, 0])
        if observation["done"].item():
            returns.append(observation["episode_return"].item())
            logging.info(
                "Episode ended after %d steps. Return: %.2f",
                observation["episode_step"].item(),
                observation["episode_return"].item(),
            )
        #index_img = vector2index_img(observation["frame"])
        #render_index_img(gym_env.get_index_img())
        #print(str(env.gym_env))
        #print("-"*15)
        #time.sleep(0.1)
        total_steps += 1
        obs_index += 1
    env.close()
    if flags.store_stats:
        if "evaluation" in policy_outputs:
            for i, eval in enumerate(evals):
                np.save(log_path + f"/eval-{i}-arity.npy", np.stack(eval))
        np.save(log_path + "/obs.npy", np.stack(obs))
        pd.DataFrame(stats).to_csv(log_path + "/stats.csv")
    mean = sum(returns) / len(returns)
    std = np.std(returns)
    logging.info("Average returns over %i steps: %.2f ± %.2f", num_episodes,
                 mean, std)
    env_name = flags.env.replace("MiniGrid-", "").replace("-v0", "")

    if flags.env in ["rtfm", "rtfm-onehop"]:
        wins = np.array(returns) > -1.0
        win_rate = np.mean(wins) * 100
        win_std = np.std(wins) * 100
        print(f"{mean:.2f} ± {std:.2f}, {win_rate:.2f} ± {win_std:.2f}")
    else:
        print(f"{mean:.2f} ± {std:.2f}")
    return mean, std
Example no. 10
def act(flags, gym_env, actor_index: int, free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue, buffers: Buffers, actor_buffers: Buffers,
        actor_model_queues: List[mp.SimpleQueue],
        actor_env_queues: List[mp.SimpleQueue]):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        #seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        #gym_env.seed(seed)
        if flags.agent in ["CNN"]:
            env = environment.Environment(gym_env, "image")
        elif flags.agent in ["NLM", "KBMLP", "GCN"]:
            if flags.state in ["relative", "integer", "block"]:
                env = environment.Environment(gym_env, "VKB")
            elif flags.state == "absolute":
                env = environment.Environment(gym_env, "absVKB")
        env_output = env.initial()
        for key in env_output:
            actor_buffers[key][actor_index][0] = env_output[key]
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in actor_buffers:
                buffers[key][index][0] = actor_buffers[key][actor_index][0]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                actor_model_queues[actor_index].put(actor_index)
                env_info = actor_env_queues[actor_index].get()
                if env_info == "exit":
                    return

                timings.time("model")

                env_output = env.step(actor_buffers["action"][actor_index][0])

                timings.time("step")

                for key in actor_buffers:
                    buffers[key][index][t + 1] = actor_buffers[key][actor_index][0]
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in env_output:
                    actor_buffers[key][actor_index][0] = env_output[key]

                timings.time("write")

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
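Here the actor never runs the model itself: it writes its latest env_output into actor_buffers, announces its index on actor_model_queues, and waits on actor_env_queues until an action has been written back into actor_buffers["action"] (or until it receives "exit"). The serving side is not shown; a per-actor loop could look like the following sketch, which assumes the model accepts the buffered observation dict and returns a dict with at least an "action" entry:

import torch

def serve_actor(actor_index, model, actor_buffers, actor_model_queues, actor_env_queues):
    # One serving loop per actor, e.g. run as a thread in the central inference process.
    while True:
        i = actor_model_queues[actor_index].get()  # actor i has written a fresh env_output
        env_output = {key: actor_buffers[key][i][0] for key in actor_buffers}
        with torch.no_grad():
            agent_output = model(env_output)
        # Any other agent outputs kept in actor_buffers would be written back the same way.
        actor_buffers["action"][i][0] = agent_output["action"]
        actor_env_queues[actor_index].put("step")  # any value other than "exit" resumes the actor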