import datetime
import os.path as osp
import random
import time

import gym
import numpy as np
import tensorflow as tf

# Project-level helpers used below (ReplayMemory, MotionTracer, EnvironmentWrapper,
# DeepQ, LinearSchedule, build_dueling_q_func) are assumed to be importable from
# the surrounding code base.


class Athlete(object):
    def __init__(self,
                 environment_name="CartPole-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        self.environment = gym.make(environment_name)
        state = self.environment.reset()
        self.state_shape = state.shape
        self.action_space = self.environment.action_space.n
        self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size)
        self.model = self.build_network()
        self.target_model = self.build_network()
        self.action_threshold = action_threshold
        self.batch_size = batch_size
        self.gamma = gamma

    def build_network(self) -> tf.keras.Model:
        # Subclasses must return a Keras model mapping a state to per-action Q-values.
        raise NotImplementedError

    def choose_action(self, state: np.ndarray, threshold: float):
        if random.random() > threshold:
            # Explore: pick a random action
            action = random.randint(0, self.action_space - 1)
        else:
            # Exploit: pick the action with the highest predicted Q-value
            results = self.model.predict(state.reshape([1] + list(state.shape)))
            action = np.argmax(results, 1)[0]
        return action

    def simulate(self, action_threshold: float):
        state = self.environment.reset()
        while not self.replay_memory.is_full:
            action = self.choose_action(state, action_threshold)
            state_after, reward, done, _ = self.environment.step(action)
            self.replay_memory.add(state, action, reward, done, state_after)
            state = state_after
            if done:
                state = self.environment.reset()
        return True

    def train(self, epoch=100, model_prefix="saved_models/model"):
        model_prefix = model_prefix + ".epoch_{}.score_{}.h5"
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                           loss=tf.keras.losses.mean_squared_error)
        for i in range(epoch):
            print("Epoch {} running...".format(i))
            # Sync the target network, refill the replay memory, then fit on it.
            self.target_model.set_weights(self.model.get_weights())
            self.replay_memory.reset()
            self.simulate(self.action_threshold)
            self.replay_memory.compute_estimated_q(self.target_model, self.gamma)
            num_batches = self.replay_memory.length / self.batch_size
            for j in range(int(num_batches)):
                states, actions, rewards, dones, next_states, estimated_q = \
                    self.replay_memory.random_batch(self.batch_size)
                self.model.fit(states, estimated_q, epochs=1, verbose=0)
            if i % 5 == 0:
                score = self.estimate_model(self.model, render=False)
                model_path = model_prefix.format(i, score)
                print("Saving model: {} ...".format(model_path))
                self.model.save(model_path)

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        state = self.environment.reset()
        reward_count = 0
        while True:
            action = model.predict(state.reshape([1] + list(self.state_shape)))
            print(state)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            state_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            state = state_after
        print("Steps taken: ", reward_count)
        return reward_count

    def score_model(self, model=None, model_path="", num_iteration=10):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        scores = []
        for i in range(num_iteration):
            score = self.estimate_model(model)
            scores.append(score)
        avg_score = sum(scores) / num_iteration
        return avg_score
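
# Usage sketch (illustrative only): Athlete.build_network() is abstract, so a
# concrete subclass has to supply a network. The CartPoleAthlete name and the
# two-layer MLP below are assumptions for demonstration, not part of the
# original source.
class CartPoleAthlete(Athlete):
    def build_network(self) -> tf.keras.Model:
        return tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation="relu", input_shape=self.state_shape),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(self.action_space),
        ])


# athlete = CartPoleAthlete("CartPole-v1")
# athlete.train(epoch=50)
# athlete.score_model(model=athlete.model)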
class MotionAthlete(Athlete):
    def __init__(self,
                 environment_name="Acrobot-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        super(MotionAthlete, self).__init__(environment_name, replay_memory_size,
                                            action_threshold, batch_size, gamma)
        # Replace the raw gym environment with a frame-producing wrapper.
        self.environment.close()
        del self.environment
        self.environment = EnvironmentWrapper(environment_name)
        frame = self.environment.reset()
        frame_shape = frame.shape
        self.motion_tracer = MotionTracer(frame_shape=frame_shape)
        self.state_shape = self.motion_tracer.state_shape
        self.replay_memory = ReplayMemory(self.state_shape, capacity=replay_memory_size)
        # Rebuild the networks so their input matches the motion-tracer state shape.
        del self.model
        del self.target_model
        self.model = self.build_network()
        self.target_model = self.build_network()

    def simulate(self, action_threshold: float):
        print("Simulating...")
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        while not self.replay_memory.is_full:
            state = self.motion_tracer.get_state()
            action = self.choose_action(state, action_threshold)
            frame_after, reward, done, _ = self.environment.step(action)
            self.motion_tracer.add_frame(frame_after)
            state_next = self.motion_tracer.get_state()
            self.replay_memory.add(state, action, reward, done, state_next)
            if done:
                frame = self.environment.reset()
                self.motion_tracer.reset()
                self.motion_tracer.add_frame(frame)
        print("Simulation finished")
        return True

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        state = self.motion_tracer.get_state()
        reward_count = 0
        step_count = 0
        while True:
            step_count += 1
            action = model.predict(state.reshape([1] + list(self.state_shape)))
            print(frame)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            frame_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            self.motion_tracer.add_frame(frame_after)
            state = self.motion_tracer.get_state()
        print("Total reward: ", reward_count)
        print("Total step: ", step_count)
        return reward_count
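
# MotionTracer is external to this snippet. The sketch below is inferred only
# from the interface used above (state_shape, reset, add_frame, get_state); the
# "current frame + frame difference" two-channel encoding is an assumption, not
# the original implementation.
class SimpleMotionTracer:
    def __init__(self, frame_shape):
        self.frame_shape = tuple(frame_shape)
        # One channel for the latest frame, one for the motion (frame delta).
        self.state_shape = self.frame_shape + (2,)
        self.reset()

    def reset(self):
        self._last_frame = np.zeros(self.frame_shape, dtype=np.float32)
        self._motion = np.zeros(self.frame_shape, dtype=np.float32)

    def add_frame(self, frame):
        frame = frame.astype(np.float32)
        self._motion = frame - self._last_frame
        self._last_frame = frame

    def get_state(self):
        # Stack the latest frame and its motion along a trailing channel axis.
        return np.stack([self._last_frame, self._motion], axis=-1)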
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    ----------
    env: gym.Env
        OpenAI gym environment to train on
    conv_layers: list
        a list of triples that defines the convolutional network
    learning_rate: float
        learning rate for the Adam optimizer
    total_timesteps: int
        number of env steps to run the environment for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of the random-action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        filesystem path for storing the checkpoints
    learning_starts: int
        how many steps of transitions to collect before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    double_dqn: bool
        whether double Q-learning is used during training

    Returns
    -------
    q_network: tf.keras.Model
        the trained Q-network
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)
    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)
        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()
        new_obs, reward, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            weights, _ = tf.ones_like(rewards), None
            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update the target network every `target_network_update_freq` steps.
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = ("Steps: {}, Episodes: {}, 100 ep reward average: {}, "
                          "Reward: {}, Epsilon-greedy %explore: {}")
            print(format_str.format(t, number_episodes, reward_100_mean,
                                    episode_rewards[-2],
                                    int(100 * exploration.value(t))))
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', dqn.train_loss_metrics.result(), step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Reset the loss metric at the end of every training step.
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
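
# Usage sketch (illustrative only): the (num_filters, kernel_size, stride)
# convention for conv_layers and the Breakout environment name are assumptions;
# build_dueling_q_func may expect a different layout, and the Atari extras for
# gym must be installed for this environment to exist.
if __name__ == "__main__":
    env = gym.make("Breakout-v4")
    conv_layers = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
    q_network = train_model(env,
                            conv_layers,
                            learning_rate=1e-4,
                            total_timesteps=200000,
                            double_dqn=True)
    q_network.save("saved_models/breakout_dueling_dqn.h5")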