from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 100000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size=size, traj_dir="./traj/")

# collect a single sample transition from the environment
state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

# fill the buffer with copies of that transition
for _ in range(size):
    memory.add(state, action, reward, next_state, done)

print(len(memory))

# persist the buffer to disk, then recover it into a fresh instance
memory.save()
del memory

memory = ReplayBuffer(size=size, recover_data=True, traj_dir="./traj/")
print(len(memory))
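# A minimal sanity check on the recovered buffer (assumption: the buffer exposes the
# same sample(batch_size=...) interface used in the training examples below and
# returns NumPy arrays in (states, actions, rewards, next_states, dones) order).
states, actions, rewards, next_states, dones = memory.sample(batch_size=32)
print(states.shape)  # e.g. (32, 84, 84, 4) for stacked wrap_deepmind Atari frames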
import itertools
import time

for i in itertools.count():
    state = env.reset()
    total_reward = 0
    start = time.time()
    agent.random_process.reset_states()
    done = False
    episode_len = 0
    while not done:
        # env.render()
        if global_timestep.numpy() < agent.params.learning_start:
            action = env.action_space.sample()
        else:
            action = agent.predict(state)
        # scale for execution in env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
        next_state, reward, done, info = env.step(action * env.action_space.high)
        replay_buffer.add(state, action, reward, next_state, done)

        global_timestep.assign_add(1)
        episode_len += 1
        total_reward += reward
        state = next_state

        # for evaluation purposes
        if global_timestep.numpy() % agent.params.eval_interval == 0:
            agent.eval_flg = True

    """
    ===== After 1 Episode is Done =====
    """
    # train the model at this point
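    # A minimal sketch of the elided training step (assumption: the DDPG agent exposes a
    # batch-wise update() method and a params.batch_size field; the actual tf_rl method
    # name and signature may differ).
    if global_timestep.numpy() > agent.params.learning_start:
        for _ in range(episode_len):
            states, actions, rewards, next_states, dones = \
                replay_buffer.sample(batch_size=agent.params.batch_size)
            loss = agent.update(states, actions, rewards, next_states, dones)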
import numpy as np
import tensorflow as tf

def ask_expert(states):
    # query the pretrained expert for the greedy action on each state
    expert_action = expert.main_model(states)
    expert_action = tf.argmax(expert_action, axis=-1)
    return expert_action

# DAgger-style loop: the learner collects its own rollouts, but supervision for the
# update comes from relabelling sampled states with the expert's actions
reward_total = list()

for epoch in range(300):
    state = env.reset()
    done = False
    reward_ep = 0
    while not done:
        if epoch <= 1:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state=state)
            action = np.squeeze(action).astype(np.int8)
        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        reward_ep += reward

    reward_total.append(reward_ep)

    # update the learner on expert-relabelled states sampled from the buffer
    losses = list()
    for grad_step in range(10):
        states, _, _, _, _ = buffer.sample(batch_size=32)
        expert_action = ask_expert(states)
        loss = agent.update(states, expert_action)
        losses.append(loss.numpy())

    print("Ep: {} Reward: {} MAR: {:.4f} Loss: {:.4f}".format(
        epoch, reward_ep, np.mean(reward_total), np.mean(losses)))

env.close()