Example #1
import numpy as np

# run_episode is assumed to come from the project's train module, as in
# Example #3 below, which imports it that way.
from train import run_episode


def train_embedder(mimicpolicy,
                   masterpolicy,
                   env,
                   scaler,
                   n_episodes=100):

    # Get parameters from policy
    seq_len = mimicpolicy.seq_len
    n_units_list = mimicpolicy.n_units_list

    for i in range(n_episodes):
        observes, actions, _, _ = run_episode(env,
                                              masterpolicy,
                                              scaler,
                                              animate=False)

        # Initialize the recurrent (cell, hidden) state pair for each layer of
        # the mimic policy, then warm it up on the first window of observations.
        state_list = []
        for n in n_units_list:
            st_c = np.zeros((1, n))
            st_h = np.zeros((1, n))
            state_list.append((st_c, st_h))

        state_list = mimicpolicy.run(observes[:seq_len], state_list)

        # Train on the remaining windows, carrying the recurrent state forward;
        # window j = 0 was already consumed above to warm up the state.
        ep_mse = 0
        for j in range(1, int(len(observes) / seq_len)):
            mse, state_list = mimicpolicy.train(
                observes[seq_len * j:seq_len * j + seq_len], state_list)
            ep_mse += mse

        print("Episode {}/{}, mse: {}".format(i, n_episodes,
                                            ep_mse/int(len(observes)/seq_len)))
def evaluate_mimic(mimicpolicy, masterpolicy, scaler, env, n):
    total_mse = 0
    for i in range(n):
        observes, actions, _, _ = run_episode(env,
                                              masterpolicy,
                                              scaler,
                                              animate=False)
        total_mse += mimicpolicy.evaluate(observes, actions)

    return total_mse / n
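A hypothetical driver combining this function with train_embedder from Example #1; mimicpolicy, masterpolicy, env and scaler are all assumed to be constructed elsewhere in the project:

# Fit the mimic/embedder on rollouts from the trained master policy, then
# report its average MSE over a handful of fresh evaluation episodes.
train_embedder(mimicpolicy, masterpolicy, env, scaler, n_episodes=100)
avg_mse = evaluate_mimic(mimicpolicy, masterpolicy, scaler, env, 10)
print("Average evaluation MSE: {}".format(avg_mse))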
Example #3
    # Method from what appears to be a Q-network class; `self` provides a
    # TensorFlow session, observation/action placeholders and the Q tensor
    # used below.
    def comparemaster(self, env, masterpolicy, scaler):
        from train import run_episode
        # Compare against the master policy on a single rollout.
        for i in range(1):
            observes, actions, rewards, _ = run_episode(env,
                                                        masterpolicy,
                                                        scaler,
                                                        animate=False)

            for o, a, r in zip(observes, actions, rewards):
                # act, max_q = self.get_act(o)
                act, max_q, min_q = self.get_act_sampling(o)
                oracle_act_q = self.sess.run(self.Q, {self.obs_ph: np.expand_dims(o, 0),
                                                      self.act_ph: np.expand_dims(a, 0)})[0][0]

                print("Oracle act,q : {},{}.  argmax(Q),max(Q): {},{}".format(a, oracle_act_q, act, max_q))
Example #4
def train_mimic(mimicpolicy,
                masterpolicy,
                env,
                scaler,
                n_episodes=100,
                batchsize=32):

    for i in range(n_episodes):
        observes, actions, _, _ = run_episode(env,
                                              masterpolicy,
                                              scaler,
                                              animate=False)

        # Behavioral cloning: fit the mimic policy to the master's actions.
        mse = mimicpolicy.train(observes, actions, batchsize)

        print("Episode {}/{}, mse: {}".format(i, n_episodes, mse))
Example #5
def train_qnet(qnet,
               masterpolicy,
               env,
               scaler,
               n_episodes=100,
               batchsize=32):

    for i in range(n_episodes):
        observes, actions, rewards, _ = run_episode(env,
                                                    masterpolicy,
                                                    scaler,
                                                    animate=False)

        # Skip episodes that are too short to form training batches.
        if len(observes) <= 2 * batchsize:
            continue

        mse = qnet.train(observes, actions, rewards, batchsize)
        print("Episode {}/{}, MSE: {}".format(i, n_episodes, mse))
Example #6
import argparse

import train  # project module providing make_env, NNAgent and run_episode
# TetrisGUI is assumed to be importable from elsewhere in the project.


def rollout(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--load-model", required=True, help="Model weights")
    args = parser.parse_args(argv)

    gui = TetrisGUI()
    env, _ = train.make_env(gui, record=False)

    state_shape = env.observation_space.shape
    action_size = env.action_space.n

    agent = train.NNAgent(state_shape, action_size)

    agent.load_model(args.load_model)

    # Roll out episodes indefinitely, printing reward and step count for each.
    while True:
        rew, steps = train.run_episode(env, agent, True, None, 100000)
        print(f"reward: {rew} steps: {steps}")
Example #7
import os

import numpy as np

# `env` and `state_dim` are assumed to be defined earlier in the original
# script (e.g. a Gym MountainCar environment and its observation dimension).
num_actions = 3
model_dir = "./models_mountaincar"

# initialize/load networks and agent
q = NeuralNetwork(state_dim, num_actions)
q_target = TargetNetwork(state_dim, num_actions)
agent = Agent(q, q_target, num_actions)
agent.load(model_dir)

# run number of episodes
n_test_episodes = 15
episode_rewards = []
for i in range(n_test_episodes):
    stats = run_episode(env,
                        agent,
                        deterministic=True,
                        do_training=False,
                        rendering=True)
    episode_rewards.append(stats.episode_reward)

# save results into a .json file
results = {
    "episode_rewards": episode_rewards,
    "mean": np.array(episode_rewards).mean(),
    "std": np.array(episode_rewards).std(),
}

# create folder
if not os.path.exists("./results"):
    os.mkdir("./results")