def train_embedder(mimicpolicy, masterpolicy, env, scaler, n_episodes=100):
    """Train a recurrent mimic policy to imitate the master policy, one episode at a time."""
    # Get sequence length and LSTM layer sizes from the mimic policy
    seq_len = mimicpolicy.seq_len
    n_units_list = mimicpolicy.n_units_list

    for i in range(n_episodes):
        # Collect a trajectory from the master policy
        observes, actions, _, _ = run_episode(env, masterpolicy, scaler, animate=False)

        # Number of seq_len-sized chunks in this episode; skip episodes too short
        # to provide a warm-up chunk plus at least one training chunk
        n_chunks = int(len(observes) / seq_len)
        if n_chunks < 2:
            continue

        # Initialize the LSTM cell and hidden states of each layer to zeros
        state_list = []
        for n in n_units_list:
            st_c = np.zeros((1, n))
            st_h = np.zeros((1, n))
            state_list.append((st_c, st_h))

        # Warm up the recurrent state on the first chunk of observations
        state_list = mimicpolicy.run(observes[:seq_len], state_list)

        # Train on the remaining chunks, carrying the recurrent state forward
        ep_mse = 0
        for j in range(1, n_chunks):
            mse, state_list = mimicpolicy.train(
                observes[seq_len * j:seq_len * j + seq_len], state_list)
            ep_mse += mse
        print("Episode {}/{}, mse: {}".format(i, n_episodes, ep_mse / n_chunks))
def evaluate_mimic(mimicpolicy, masterpolicy, scaler, env, n):
    """Return the mean MSE of the mimic policy against the master policy over n episodes."""
    total_mse = 0
    for i in range(n):
        observes, actions, _, _ = run_episode(env, masterpolicy, scaler, animate=False)
        total_mse += mimicpolicy.evaluate(observes, actions)
    return total_mse / n
def comparemaster(self, env, masterpolicy, scaler):
    """Compare the Q-network's sampled greedy action/value against the master policy's action."""
    from train import run_episode
    for i in range(1):  # single comparison episode
        observes, actions, rewards, _ = run_episode(env, masterpolicy, scaler, animate=False)
        for o, a, r in zip(observes, actions, rewards):
            # act, max_q = self.get_act(o)
            act, max_q, min_q = self.get_act_sampling(o)
            # Evaluate Q for the master ("oracle") action at this observation
            oracle_act_q = self.sess.run(
                self.Q,
                {self.obs_ph: np.expand_dims(o, 0),
                 self.act_ph: np.expand_dims(a, 0)})[0][0]
            print("Oracle act,q : {},{}. argmax(Q),max(Q): {},{}".format(
                a, oracle_act_q, act, max_q))
def train_mimic(mimicpolicy, masterpolicy, env, scaler, n_episodes=100, batchsize=32):
    """Behavioral cloning: fit the mimic policy to the master policy's actions."""
    for i in range(n_episodes):
        observes, actions, _, _ = run_episode(env, masterpolicy, scaler, animate=False)
        mse = mimicpolicy.train(observes, actions, batchsize)
        print("Episode {}/{}, mse: {}".format(i, n_episodes, mse))
def train_qnet(qnet, masterpolicy, env, scaler, n_episodes=100, batchsize=32):
    """Fit a Q-network on trajectories generated by the master policy."""
    for i in range(n_episodes):
        observes, actions, rewards, _ = run_episode(env, masterpolicy, scaler, animate=False)
        # Skip episodes too short to form at least two minibatches
        if len(observes) <= 2 * batchsize:
            continue
        mse = qnet.train(observes, actions, rewards, batchsize)
        print("Episode {}/{}, MSE: {}".format(i, n_episodes, mse))
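# NOTE (sketch): hypothetical driver showing how the helpers above could be wired
# together. make_env, Scaler, load_master_policy, MimicPolicy and QNet are assumed
# names, not defined in this excerpt; substitute the repo's actual classes/loaders.
def main():
    env, obs_dim, act_dim = make_env()                      # assumed env factory
    scaler = Scaler(obs_dim)                                # assumed observation scaler
    masterpolicy = load_master_policy("./master_weights")   # assumed checkpoint loader

    # Distill the master policy into a mimic policy, then report its eval MSE
    mimicpolicy = MimicPolicy(obs_dim, act_dim)             # assumed class
    train_mimic(mimicpolicy, masterpolicy, env, scaler, n_episodes=100)
    print("Mimic MSE over 10 eval episodes:",
          evaluate_mimic(mimicpolicy, masterpolicy, scaler, env, 10))

    # Fit a Q-network on the same master-policy rollouts
    qnet = QNet(obs_dim, act_dim)                           # assumed class
    train_qnet(qnet, masterpolicy, env, scaler, n_episodes=100)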
def rollout(argv):
    """Load a trained Tetris agent and roll it out indefinitely, rendering to the GUI."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--load-model", required=True, help="Model weights")
    args = parser.parse_args(argv)

    gui = TetrisGUI()
    env, _ = train.make_env(gui, record=False)
    state_shape = env.observation_space.shape
    action_size = env.action_space.n

    agent = train.NNAgent(state_shape, action_size)
    agent.load_model(args.load_model)

    while True:
        rew, steps = train.run_episode(env, agent, True, None, 100000)
        print(f"reward: {rew} steps: {steps}")
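# NOTE (sketch): assumed command-line entry point for rollout(); the real script
# may wire this up differently. rollout() itself parses the --load-model flag.
if __name__ == "__main__":
    import sys
    rollout(sys.argv[1:])
# Example invocation (model path is illustrative):
#   python rollout.py --load-model checkpoints/tetris_model.h5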
num_actions = 3
model_dir = "./models_mountaincar"

# initialize networks and agent, then load trained weights
q = NeuralNetwork(state_dim, num_actions)
q_target = TargetNetwork(state_dim, num_actions)
agent = Agent(q, q_target, num_actions)
agent.load(model_dir)

# run a number of test episodes with the greedy (deterministic) policy
n_test_episodes = 15
episode_rewards = []
for i in range(n_test_episodes):
    stats = run_episode(env, agent, deterministic=True,
                        do_training=False, rendering=True)
    episode_rewards.append(stats.episode_reward)

# collect results for saving to a .json file
results = {
    "episode_rewards": episode_rewards,
    "mean": np.array(episode_rewards).mean(),
    "std": np.array(episode_rewards).std(),
}

# create results folder if it does not exist
if not os.path.exists("./results"):
    os.mkdir("./results")
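# NOTE (sketch): the excerpt above ends after creating ./results but before the
# actual write; a minimal, assumed version of the save step implied by the
# "saving to a .json file" comment. The filename scheme is hypothetical.
import json
import datetime

fname = "./results/mountaincar_eval_{}.json".format(
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
with open(fname, "w") as fh:
    json.dump(results, fh)
print("Saved evaluation results to {}".format(fname))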