def test_observation_zeroing(self):
    """ Tests zeroing out of frames not from current episode """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for terminal_idx in range(5):
        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            terminal = 1 if i == terminal_idx else 0
            er.append(partial_obs, 0, 0, terminal)
            if i <= terminal_idx:
                partial_obs *= 0
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
def __init__(self):
    self.eps = 0.1
    self.env = GridEnv(3)
    self.batch_size = 20

    if prioritized_replay and replay_type == "proportional":
        self.replay = ProportionalReplay(max_buffer_size, prioritized_replay_alpha)
    elif prioritized_replay and replay_type == "ranked":
        N_list = [self.batch_size] + [int(x) for x in np.linspace(100, max_buffer_size, 5)]
        save_quantiles(N_list=N_list, k=self.batch_size, alpha=prioritized_replay_alpha)
        self.replay = RankBasedReplay(max_buffer_size, prioritized_replay_alpha)
    else:
        self.replay = ExperienceReplay(max_buffer_size)  # passing size of buffer

    # define graph
    self.inputs = tf.placeholder(tf.float32, shape=(None, self.env.state_size))
    self.target_values = tf.placeholder(tf.float32, shape=(None,))
    self.actions = tf.placeholder(tf.int32, shape=(None,))
    # importance sampling weights for prioritized replay
    self.is_weights = tf.placeholder(tf.float32, shape=(None,))
    self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph()  # build main network
    self.target_Q_out_op, _, _ = self.build_graph('target')  # build identical target network
    self.init_op = tf.global_variables_initializer()
    self.sess = tf.Session()
def __init__(self):
    self.batch_size = 64              # How many experiences to use for each training step
    self.train_frequency = 5          # How often to update the network
    self.num_epochs = 20              # How many epochs to train when updating the network
    self.y = 0.99                     # Discount factor
    self.prob_random_start = 0.6      # Starting chance of random action
    self.prob_random_end = 0.1        # Ending chance of random action
    self.annealing_steps = 1000.      # Steps of training over which to reduce start_e -> end_e
    self.max_num_episodes = 10000     # Max number of episodes allowed for training
    self.min_pre_train_episodes = 100 # Number of episodes played with random actions before training starts
    self.max_num_step = 50            # Maximum allowed episode length
    self.goal = 15                    # Mean reward we want to achieve while playing a game

    # Set env
    self.env = gameEnv(partial=False, size=5)

    # Reset everything from keras session
    K.clear_session()

    # Setup our Q-networks
    self.main_qn = Qnetwork()
    self.target_qn = Qnetwork()

    # Setup our experience replay
    self.experience_replay = ExperienceReplay()
def __init__(self, random_action_method, future_discount=0.75,
             learning_rate=0.001, saveAndLoad=True):
    learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

    self.model_a = RLModel()
    self.model_a.build((None, AGENT_INPUT_SIZE))
    self.model_b = RLModel()
    self.model_b.build((None, AGENT_INPUT_SIZE))

    self.saveAndLoad = saveAndLoad
    if os.path.isfile(SAVE_PATH_A) and os.path.isfile(SAVE_PATH_B) and saveAndLoad:
        print("Loading")
        self.model_a.load_weights(SAVE_PATH_A)
        self.model_b.load_weights(SAVE_PATH_B)

    self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
    self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

    self.random_action_method = random_action_method
    self.learning_rate = learning_rate
    self.future_discount = future_discount

    self.loss_measure = tf.losses.MeanSquaredError()
    self.opt = tf.optimizers.Adam(lr=self.learning_rate)

    self.n_since_last_train = 0
    self.latestLoss = tf.add(0, 0)
def __init__(self, random_action_method, future_discount=0.75,
             learning_rate=0.001, load_path=None):
    learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

    self.model = RLModel()
    self.model.build((None, AGENT_INPUT_SIZE))

    self.load_path = load_path
    if load_path is not None and os.path.isfile(load_path):
        print("Loading")
        self.model.load_weights(load_path)

    self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

    self.random_action_method = random_action_method
    self.learning_rate = learning_rate
    self.future_discount = future_discount

    self.loss_measure = tf.losses.MeanSquaredError()
    self.opt = tf.optimizers.Adam(lr=self.learning_rate)

    self.n_since_last_train = 0
    self.latestLoss = tf.add(0, 0)
def __init__(self):
    self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                              FLAGS.replay_buffer_size,
                                              84, 84, 4,
                                              self.policy,
                                              FLAGS.decay_to_epoch)
    config = DQNConfig()
    config.learning_rate = FLAGS.learning_rate
    config.gamma = FLAGS.gamma
    config.decay = FLAGS.decay
    config.momentum = FLAGS.momentum
    config.eps = FLAGS.eps
    config.input_width = FLAGS.image_width
    config.input_height = FLAGS.image_height
    config.skip = FLAGS.skip
    self.dqn = DQN(config, FLAGS.use_huber)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    logger.info('initializing variables...')
    self.sess.run(tf.global_variables_initializer())
    self.update_target()
    self.epoch = 0
    self.decay_epsilon()
def __init__(self,
             env,
             batchsize=64,
             pic_size=(96, 96),
             num_frame_stack=4,
             gamma=0.95,
             frame_skip=1,
             train_freq=4,
             initial_epsilon=1.0,
             min_epsilon=0.1,
             render=True,
             epsilon_decay_steps=int(1e6),
             min_experience_size=int(1e3),
             experience_capacity=int(1e5),
             network_update_freq=5000,
             regularization=1e-6,
             optimizer_params=None,
             action_map=None):
    self.exp_history = ExperienceReplay(num_frame_stack,
                                        capacity=experience_capacity,
                                        pic_size=pic_size)
    self.playing_cache = ExperienceReplay(num_frame_stack,
                                          capacity=num_frame_stack * 5 + 10,
                                          pic_size=pic_size)
    self.network_update_freq = network_update_freq
    self.action_map = action_map
    self.env = env
    self.batchsize = batchsize
    self.num_frame_stack = num_frame_stack
    self.gamma = gamma
    self.frame_skip = frame_skip
    self.train_freq = train_freq
    self.initial_epsilon = initial_epsilon
    self.min_epsilon = min_epsilon
    self.epsilon_decay_steps = epsilon_decay_steps
    self.render = render
    self.min_experience_size = min_experience_size
    self.pic_size = pic_size
    self.regularization = regularization
    self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7)
    self.do_training = True
    self.playing_epsilon = 0.0
    self.session = None
    self.state_size = (self.num_frame_stack,) + self.pic_size
    self.global_counter = 0
    self.episode_counter = 0
    if action_map is not None:
        self.dim_actions = len(action_map)
    else:
        self.dim_actions = env.action_space.n
    self.q_values = []
    self.loss_his = []
def test_sampling(self):
    """ Tests sampling of rewards, actions and terminals """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        er.append(partial_obs, 1, 1, 0)
    batch = er.sample(1)
    _, rewards, actions, _, terminals = batch
    assert np.array_equal(rewards, np.array([1]))
    assert np.array_equal(actions, np.array([1]))
    assert np.array_equal(terminals, np.array([0]))
def __init__(self):
    self.prob_random = 1.0        # Probability to play a random action
    self.y = .99                  # Discount factor
    self.batch_size = 64          # How many experiences to use for each training step
    self.prob_random_end = .01    # Ending chance of random action
    self.prob_random_decay = .996 # Decay rate of prob_random
    self.max_episode = 300        # Max number of episodes allowed for training
    self.expected_goal = 200      # Expected goal

    self.dnn = DNN()
    self.env = gym.make('CartPole-v0')
    self.memory = ExperienceReplay(buffer_size=10000)
    self.metadata = []  # score info stored here at the end of each episode
def __init__(self):
    # gamma is a parameter of the Q-learning algorithm
    self.gamma = 0.9
    # We use an epsilon-greedy learning strategy
    self.epsilon = 1
    self.epsilon_decay = 0.99
    self.epsilon_min = 0.01
    # Number of epochs (fully played games) to train the agent
    self.epochs = 500
    # Game to play
    self.game = Game()
    # Number of hidden layer nodes
    self.hidden_layer_nodes = 20
    # Create keras model
    # _________________________________________________________________
    # Layer (type)                 Output Shape              Param #
    # =================================================================
    # dense_1 (Dense)              (None, 20)                120
    # _________________________________________________________________
    # dense_2 (Dense)              (None, 20)                420
    # _________________________________________________________________
    # dense_3 (Dense)              (None, 5)                 105
    # =================================================================
    # Total params: 645
    # Trainable params: 645
    # Non-trainable params: 0
    # _________________________________________________________________
    self.model = Sequential()
    self.model.add(Dense(self.hidden_layer_nodes,
                         input_dim=self.game.state_size,
                         activation='relu'))
    self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
    self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
    self.model.compile('Adam', loss='mse')
    # Initialize experience replay
    self.experience_replay = ExperienceReplay(size=2000)
    self.batch_size = 20
    self.max_turns = 100
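# Sanity check on the parameter counts in the summary above (assuming
# game.state_size == 5 and len(POSSIBLE_ACTIONS) == 5, which the counts imply):
#   dense_1: 5 inputs * 20 units + 20 biases  = 120
#   dense_2: 20 inputs * 20 units + 20 biases = 420
#   dense_3: 20 inputs * 5 units + 5 biases   = 105
#   Total:   120 + 420 + 105                  = 645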
def test_observation_construction(self):
    """ Tests observation construction from partial observations """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    obs_ = []
    obs_next_ = []
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        if i < 5:
            obs_.append(partial_obs)
        if i > 1:
            obs_next_.append(partial_obs)
        er.append(partial_obs, 0, 0, 0)
    obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
    obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
    batch = er.sample(1)
    obs, rewards, actions, obs_next, terminals = batch
    assert np.array_equal(obs_, obs)
    assert np.array_equal(obs_next_, obs_next)
def __init__(self, env, net_update_rate: int = 25,
             exploration_rate: float = 1.0, exploration_decay: float = 0.00005):
    # set hyperparameters
    self.exploration_rate = exploration_rate
    self.exploration_decay = exploration_decay
    self.net_updating_rate = net_update_rate

    # set environment
    self.env = env
    self.state_shape = env.get_state_shape()
    self.action_shape = env.get_action_shape()

    # Experience Replay for batch learning
    self.exp_rep = ExperienceReplay()
    # Deep Q Network
    self.net = None
def init():
    train_env = SquigglesEnvironment(num_notes=2)
    evaluation_env = SquigglesEnvironment(num_notes=2)

    train_env = tf_py_environment.TFPyEnvironment(train_env)
    evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_env)

    agent, _ = generic_dqn_agent(train_env)
    experience_replay = ExperienceReplay(agent, train_env, BATCH_SIZE)

    return agent, train_env, evaluation_env, experience_replay
def __init__(self, FLAGS):
    """
    This class builds the model that implements the deep deterministic
    policy gradient (DDPG) algorithm.

    :param FLAGS: TensorFlow flags which contain the values for hyperparameters
    """
    self.FLAGS = FLAGS
    self.env = gym.make('Pendulum-v0')
    self.state_size = len(self.env.observation_space.sample())
    self.num_episodes = 1000
    self.batch_size = 64
    self.exp_replay = ExperienceReplay(50000, 1500, FLAGS)
    self.action_noise = OrnsteinUhlenbeckActionNoise(self.env, mu=0.0, sigma=0.2,
                                                     theta=.15, dt=1e-2, x0=None)

    self.actor_target = Actor(scope='target', target_network=None, env=self.env, flags=FLAGS)
    self.actor = Actor(scope='actor', target_network=self.actor_target, env=self.env, flags=FLAGS)
    self.critic_target = Critic(scope='target', target_network=None, env=self.env, flags=FLAGS)
    self.critic = Critic(scope='critic', target_network=self.critic_target, env=self.env, flags=FLAGS)

    init = tf.global_variables_initializer()
    self.session = tf.InteractiveSession()
    self.session.run(init)

    self.critic.set_session(self.session)
    self.actor.set_session(self.session)
    self.actor_target.set_session(self.session)
    self.critic_target.set_session(self.session)

    self.critic.init_target_network()
    self.actor.init_target_network()
def __init__(self, s_size, a_size, seed):
    """
    Parameters:
        s_size (int): dimension of each state
        a_size (int): dimension of each action
        seed (int): random seed
    """
    self.s_size = s_size
    self.a_size = a_size
    self.seed = random.seed(seed)

    # Initialize both the Q-networks
    self.local_dqn = Model(s_size, a_size, seed).to(device)
    self.target_dqn = Model(s_size, a_size, seed).to(device)
    self.optimizer = optim.Adam(self.local_dqn.parameters(), lr=c.LEARNING_RATE)

    # Initialize experience deque
    self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE, c.BATCH_SIZE, seed)

    # Time step counter used for updating as per UPDATE_FREQUENCY
    self.t_step = 0
def run_episode(plan_step_fn, learner, dataset, cache_subtree, add_returns,
                preproc_obs_fn=None, render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    # New auxiliary buffer to save current episode transitions
    aux_replay = ExperienceReplay()
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a, cache_subtree, render,
                                                       render_size=(512, 512))
        aux_replay.append({"observations": prev_root_data["obs"],
                           "target_policy": tree_policy})
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"], dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards, discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    # Add transitions to the buffer that will be used for learning
    dataset.extend(aux_replay)
    return episode_rewards
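# A minimal sketch of what compute_returns above is assumed to do: fold the
# episode rewards backwards into discounted returns. The name and signature are
# taken from the call site; the body here is an assumption, not the original.
import numpy as np

def compute_returns(rewards, discount_factor):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        # each return is the immediate reward plus the discounted future return
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns

# e.g. compute_returns([0, 0, 1], 0.99) -> [0.9801, 0.99, 1.0]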
def main():
    hist_length = 50
    processor = Processor(history_length=hist_length)
    price_history = processor.fetchData()
    train_price_history = price_history['train']
    test_price_history = price_history['test']

    env = Environment(horizon=20,
                      train_price_history=train_price_history,
                      test_price_history=test_price_history,
                      history_length=hist_length)
    exp_replay = ExperienceReplay()
    agent = Agent(feature_size=6, window=hist_length, action_size=3,
                  experience_replay=exp_replay, environment=env)
    agent.train()
    print("Agent done training, now testing: ")
    agent.test(test_price_history)
def __init__(self, game, memory_size=100000, batch_size=1, epsilon_init=1.0,
             alpha_init=.00025, anneal_alpha=True, anneal_epsilon=True,
             batch_size_incr=0):
    self.memories = ExperienceReplay(memory_size)
    self.nnet = LeNet(game.state_shape, dim_out=game.n_actions, batch_size=1,
                      fc_dim=500, nkerns=[16, 32], filter_dims=[2, 2],
                      out_type='linear')
    self.trainer = single_batch_trainer(self.nnet)
    self.game = game
    self.n_episodes = 0
    self.avg_rewards = []
    self.avg_action_vals = []
    self.alpha = alpha_init
    self.epsilon = epsilon_init
    self.anneal_ep = anneal_epsilon
    self.anneal_lr = anneal_alpha
    self.batch_size = batch_size
    self.batch_size_incr = batch_size_incr
    self._pct_invalids = []
    self._costs = []
class DeepQlearner:
    def __init__(self, random_action_method, future_discount=0.75,
                 learning_rate=0.001, load_path=None):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))

        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method
        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(lr=self.learning_rate)

        self.n_since_last_train = 0
        self.latestLoss = tf.add(0, 0)

    def getActions(self, agentInputs):
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return [rand_action] * agentInputs.shape[0]
        else:
            pred = self.model.call(agentInputs)
            # print(pred[0])
            return [ACTIONS[x] for x in np.argmax(pred, axis=1)]

    def update(self, oldAgentInputs, actions, newAgentInputs, rewards):
        # Add to the experience replay
        actions = np.array([ACTIONS.index(action) for action in actions])
        # print(["LEFT", "RIGHT", "JUMP", "NONE"][actions[0]], rewards[0])
        self.exp_rep.add_experinces(oldAgentInputs, actions, newAgentInputs, rewards)

        self.n_since_last_train += oldAgentInputs.shape[0]
        if self.n_since_last_train > TRAIN_RATE:
            loss = self.train_on_random_minibatch()
            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        input, action, new_input, reward = self.exp_rep.get_random_minibatch(BATCH_SIZE)
        loss = self.train_on_batch(input, action, new_input, reward)
        # if self.load_path is not None:
        #     self.save(self.load_path)
        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after, reward):
        q_after = self.model(agent_input_after)
        wanted_q = reward + self.future_discount * tf.reduce_max(q_after, axis=1)
        # wanted_q = reward
        tvars = self.model.trainable_variables
        with tf.GradientTape() as tape:
            pred_q_for_all_actions = self.model(agent_input_before)
            # Index with the actions that were actually taken
            action_ind = tf.transpose([tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions, action_ind)
            loss = self.loss_measure(wanted_q, pred_q_for_action)
        gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))
        self.latestLoss = loss
        return loss

    def save(self, path=SAVE_PATH):
        self.model.save_weights(path)
class DQN:
    def __init__(self):
        self.batch_size = 64              # How many experiences to use for each training step
        self.train_frequency = 5          # How often to update the network
        self.num_epochs = 20              # How many epochs to train when updating the network
        self.y = 0.99                     # Discount factor
        self.prob_random_start = 0.6      # Starting chance of random action
        self.prob_random_end = 0.1        # Ending chance of random action
        self.annealing_steps = 1000.      # Steps of training over which to reduce start_e -> end_e
        self.max_num_episodes = 10000     # Max number of episodes allowed for training
        self.min_pre_train_episodes = 100 # Number of episodes played with random actions before training starts
        self.max_num_step = 50            # Maximum allowed episode length
        self.goal = 15                    # Mean reward we want to achieve while playing a game

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre_train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network
            # First add one dimension to the state to fit the expected input shape
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience replay for the current episode
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Game is complete
        cur_step = 0  # Running sum of number of steps taken in episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Setup the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.main_qn.model.predict(train_state)

        # Tells us whether the game is over or not
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(self.batch_size)], axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values * train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # Train batch is [[state, action, reward, next_state, done], ...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into numpy arrays for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will begin reducing the probability of acting randomly, and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variables
        num_steps = []    # Tracks number of steps per episode
        rewards = []      # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]      # Tracks training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:
                # Drop the probability of a random action until we reach prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                      .format(dt_string, num_episode,
                              np.mean(rewards[-print_every:]), prob_random, mean_loss))

            # Stop condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                      .format(dt_string, num_episode,
                              np.mean(rewards[-print_every:]), prob_random, mean_loss))
                print("Training complete because we reached goal rewards.")
                break

            if num_episode > self.max_num_episodes:
                print("Training stopped because we reached the max number of episodes.")
                break
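# A standalone numeric sketch of the Bellman target computed in generate_target_q
# above (toy numbers, not produced by the real networks):
import numpy as np

y = 0.99                                        # discount factor, as above
train_reward = np.array([1.0, 0.0])             # rewards for a 2-sample batch
train_done = np.array([0, 1])                   # second transition ends the episode
train_next_state_values = np.array([2.0, 3.0])  # max_a' Q_target(next_state, a')

train_gameover = train_done == 0                # 1 where the episode continues
actual_reward = train_reward + y * train_next_state_values * train_gameover
# -> [2.98, 0.0]: terminal transitions keep only the immediate reward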
class DQN(object):
    """ OOP for a Deep Q-Network (DQN). """

    def __init__(self, game, memory_size=100000, batch_size=1, epsilon_init=1.0,
                 alpha_init=.00025, anneal_alpha=True, anneal_epsilon=True,
                 batch_size_incr=0):
        self.memories = ExperienceReplay(memory_size)
        self.nnet = LeNet(game.state_shape, dim_out=game.n_actions, batch_size=1,
                          fc_dim=500, nkerns=[16, 32], filter_dims=[2, 2],
                          out_type='linear')
        self.trainer = single_batch_trainer(self.nnet)
        self.game = game
        self.n_episodes = 0
        self.avg_rewards = []
        self.avg_action_vals = []
        self.alpha = alpha_init
        self.epsilon = epsilon_init
        self.anneal_ep = anneal_epsilon
        self.anneal_lr = anneal_alpha
        self.batch_size = batch_size
        self.batch_size_incr = batch_size_incr
        self._pct_invalids = []
        self._costs = []

    def train(self, n_episodes=3, max_iter=500):
        g = self.game
        g.reset()

        # set anneal rates for epsilon and alpha
        ep_anneal_rate = 0
        if self.anneal_ep:
            ep_anneal_rate = float(self.epsilon) / n_episodes
        alpha_anneal_rate = 0
        if self.anneal_lr:
            alpha_anneal_rate = float(self.alpha) / n_episodes

        for e_idx in range(n_episodes):
            s = g.get_state()
            print("Episode: %d, Exploration Rate: %f, Learning Rate: %f"
                  % (e_idx, self.epsilon, self.alpha))
            while (not g.is_terminal() and not self.game._num_moves >= max_iter
                   and not self.game.iter_ctr >= 200):
                # epsilon-greedy action selection below
                if np.random.binomial(1, self.epsilon):
                    a_idx = np.random.randint(self.game.n_actions)
                else:
                    values = self.nnet.outputter(s.reshape(self.nnet.image_shape))
                    a_idx = np.argmax(values[0])
                r = g.take_action(a_idx)
                stp1 = g.get_state()
                # Reshape states into shape expected by convnet.
                self.memories.insert(Memory(
                    s.transpose(2, 0, 1).reshape(self.nnet.image_shape),
                    a_idx, r,
                    stp1.transpose(2, 0, 1).reshape(self.nnet.image_shape)))
                s = stp1
                if self.game.iter_ctr % 200 == 0:
                    print("move_n: %d, action: %d, reward: %f, status: %d"
                          % (self.game.iter_ctr, a_idx, r, self.game._STATUS))

                # Minibatch update.
                if e_idx > 0:
                    costs = []  # local for this iter
                    # random (state, action, reward, nxt_state) sample from memory replay
                    data = self.memories.sample(self.batch_size)
                    # convert the above tuples into training data/label pairs
                    data = [m.target_pair(self.nnet) for m in data]
                    for i in range(self.batch_size):
                        d = data[i]
                        costs.append(self.trainer(d[0], d[1], self.alpha))  # call trainer func
                    self._costs.append(np.mean(costs))

            # print("Game %d ends in %d iterations with status %d, reward %d."
            #       % (e_idx, self.game.iter_ctr, self.game._STATUS, r))

            # compute percent invalid actions
            n_moves = g.iter_ctr
            rs = g.episode_rewards
            n_invalid = len(np.where(rs == np.array([-.02 for _ in range(len(rs))]))[0])
            pct_invalid = float(n_invalid) / n_moves
            self._pct_invalids.append(pct_invalid)
            print("Pct Invalid: %f" % pct_invalid)
            g.reset()
            self.epsilon -= ep_anneal_rate
            self.batch_size += self.batch_size_incr
            if e_idx > 0:
                self.alpha -= alpha_anneal_rate
def __init__(self, env, obs_size=(115,), num_frame_stack=1, batch_size=32,
             mdp_gamma=0.95, initial_epsilon=1.0, min_epsilon=0.1,
             epsilon_decay_steps=int(1e6), replay_capacity=int(1e5),
             min_replay_size=int(1e3), train_freq=4, network_update_freq=5000,
             regularization=1e-6, optimizer_params=None, render=False):
    """
    Initialization function

    param env: object. a gym-like environment which our RL agent interacts with
    param obs_size: list. the shape of the observation, i.e. (115,) for a vector
        observation or (32,32) for an image observation
    param num_frame_stack: int. number of stacked frames for network input
    param batch_size: int. batch size
    param mdp_gamma: float. MDP discount factor
    param initial_epsilon: float. epsilon parameter of the epsilon-greedy policy
    param min_epsilon: float. minimum epsilon parameter of the epsilon-greedy policy
    param epsilon_decay_steps: int. how many steps to decay epsilon over
    param replay_capacity: int. replay buffer size
    param min_replay_size: int. minimum replay buffer size
    param train_freq: int. training frequency
    param network_update_freq: int. network update frequency
    param regularization: float. regularization coefficient
    param optimizer_params: dict. optimizer-specific parameters, i.e. learning rate, momentum
    param render: bool. is render mode on?
    """
    # experience replay buffer for training
    self.exp_buffer = ExperienceReplay(num_frame_stack,
                                       capacity=replay_capacity,
                                       obs_size=obs_size)
    # experience replay buffer for playing/testing
    self.play_buffer = ExperienceReplay(num_frame_stack,
                                        capacity=num_frame_stack * 10,
                                        obs_size=obs_size)
    self.env = env
    self.obs_size = obs_size
    self.num_frame_stack = num_frame_stack
    self.batch_size = batch_size
    self.mdp_gamma = mdp_gamma
    self.initial_epsilon = initial_epsilon
    self.min_epsilon = min_epsilon
    self.epsilon_decay_steps = epsilon_decay_steps
    self.replay_capacity = replay_capacity
    self.min_replay_size = min_replay_size
    self.train_freq = train_freq
    self.network_update_freq = network_update_freq
    self.regularization = regularization
    self.render = render
    self.dim_actions = env.action_space.n
    self.dim_state = (num_frame_stack,) + self.obs_size
    if optimizer_params:
        self.optimizer_params = optimizer_params
    else:
        self.optimizer_params = dict(learning_rate=0.0001, epsilon=1e-7)
    self.is_training = True
    # epsilon used for playing
    # if 0, we just use the Q-network's optimal action without any exploration
    self.playing_epsilon = 0.0
    self.session = None
    self.global_counter = 0
    self.episode_counter = 0
    self.loss_history = []
class DQN: """ DQN implementation. Note that only supports an environment that is gym-like.(i.e. reset, step, ..) """ def __init__(self, env, obs_size = (115,), num_frame_stack = 1, batch_size = 32, mdp_gamma = 0.95, initial_epsilon = 1.0, min_epsilon = 0.1, epsilon_decay_steps = int(1e6), replay_capacity = int(1e5), min_replay_size = int(1e3), train_freq = 4, network_update_freq = 5000, regularization = 1e-6, optimizer_params = None, render = False): """ Initialization function param env: object. a gym-like environment which our RL agent interacts with parma obs_size: list. the shape of the observation, i.e. (115,) for vector observation or (32,32) for image observation parma num_frame_stack: int. number of stacked frames for network input param batch_size: int. batch size param mdp_gamma: float. MDP discount factor param initial_epsilon: float. epsilon parameter of epsilon-greedy policy param min_epsilon: float. minimum epsilon parameter of epsilon-greedy policy param epsilon_decay_steps: int. how many steps to decay epsilon param replay_capacity: int. replay buffer size param min_replay_size: int. minimum replay buffer size param train_freq: int. training frequency param network_update_freq: int. network update frequency param regularization: float. regularization coefficient param optimizer_params: dict. optimizer specilized parameters. i.e. learning rate, momentum param render: bool. is render mode on? """ # experience replay buffer for training self.exp_buffer = ExperienceReplay( num_frame_stack, capacity=replay_capacity, obs_size = obs_size ) # experience replay buffer for playing/testing self.play_buffer = ExperienceReplay( num_frame_stack, capacity=num_frame_stack * 10, obs_size = obs_size ) self.env = env self.obs_size = obs_size self.num_frame_stack = num_frame_stack self.batch_size = batch_size self.mdp_gamma = mdp_gamma self.initial_epsilon = initial_epsilon self.min_epsilon = min_epsilon self.epsilon_decay_steps = epsilon_decay_steps self.replay_capacity = replay_capacity self.min_replay_size = min_replay_size self.train_freq = train_freq self.network_update_freq = network_update_freq self.regularization = regularization self.render = render self.dim_actions = env.action_space.n self.dim_state = (num_frame_stack,) + self.obs_size if optimizer_params: self.optimizer_params = optimizer_params else: self.optimizer_params = dict(learning_rate = 0.0001, epsilon = 1e-7) self.is_training = True # epsilon used for playing # if 0, means that we just use the Q-network's optimal action without any exploration self.playing_epsilon = 0.0 self.session = None self.global_counter = 0 self.episode_counter = 0 self.loss_history = [] def get_variables(self,scope): """ Get variables according to scope name """ vars_list = [] for var in tf.global_variables(): if "%s/" % scope in var.name and "Adam" not in var.name: vars_list.append(var) return sorted(vars_list, key=lambda x: x.name) def get_epsilon(self): """ Get current epsilon value. Note: with the training process, epsilon is decaying """ if self.is_training == False: return self.playing_epsilon elif self.global_counter >= self.epsilon_decay_steps: return self.min_epsilon else: # for simplicity, just use linear decay return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * (1.0 - self.global_counter / float(self.epsilon_decay_steps)) def network(self, input, trainable, use_image = False): """ Implementation of Q(s,a) network param input: tensor. 
            [Batch_Size, N_State] or [Batch_Size, Num_stack_frame, H, W]
        """
        regularizer = None
        if trainable:
            regularizer = slim.l2_regularizer(self.regularization)
        if not use_image:
            # here use a vanilla 4-layer perceptron
            # 1st layer
            net = slim.fully_connected(input, 512, activation_fn=tf.nn.relu,
                                       weights_regularizer=regularizer, trainable=trainable)
            # 2nd layer
            net = slim.fully_connected(net, 1024, activation_fn=tf.nn.relu,
                                       weights_regularizer=regularizer, trainable=trainable)
            # 3rd layer
            net = slim.fully_connected(net, 512, activation_fn=tf.nn.relu,
                                       weights_regularizer=regularizer, trainable=trainable)
            # 4th layer
            # net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
            #                            weights_regularizer=regularizer, trainable=trainable)
            # output layer
            q_state_action_values = slim.fully_connected(net, self.dim_actions,
                                                         activation_fn=None,
                                                         weights_regularizer=regularizer,
                                                         trainable=trainable)
        else:
            x = tf.transpose(input, [0, 2, 3, 1])
            net = slim.conv2d(x, 8, (7, 7), stride=3, data_format="NHWC",
                              activation_fn=tf.nn.relu,
                              weights_regularizer=regularizer, trainable=trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.conv2d(net, 16, (3, 3), stride=1, data_format="NHWC",
                              activation_fn=tf.nn.relu,
                              weights_regularizer=regularizer, trainable=trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.flatten(net)
            net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
                                       weights_regularizer=regularizer, trainable=trainable)
            q_state_action_values = slim.fully_connected(net, self.dim_actions,
                                                         activation_fn=None,
                                                         weights_regularizer=regularizer,
                                                         trainable=trainable)
        return q_state_action_values

    def sample_random_action(self):
        """ Randomly sample an action for rollout """
        return np.random.choice(self.dim_actions)

    def setup_graph(self, use_image=False, if_soft=True):
        """ Set up tensorflow computing graph """
        # define a bunch of placeholders
        if use_image:
            input_next_state_shape = (self.batch_size, self.num_frame_stack) + self.obs_size
            input_prev_state_shape = (None, self.num_frame_stack) + self.obs_size
        else:
            input_next_state_shape = (self.batch_size, self.obs_size[0])
            input_prev_state_shape = (None, self.obs_size[0])
        self.input_prev_state = tf.placeholder(tf.float32, input_prev_state_shape,
                                               name="input_prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_next_state_shape,
                                               name="input_next_state")
        self.input_actions = tf.placeholder(tf.int32, self.batch_size, name="input_actions")
        self.input_reward = tf.placeholder(tf.float32, self.batch_size, name="input_reward")
        self.is_done = tf.placeholder(tf.int32, self.batch_size, name="is_done")
        self.optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        """
        Q-learning:
        1. take action a_t according to the epsilon-greedy policy
        2. store transition (s_t, a_t, r_{t+1}, s_{t+1}) in replay buffer D
        3. sample a random mini-batch of transitions (s, a, r, s') from D
        4. compute Q-learning targets w.r.t. old, fixed parameters w-
        5. optimise the MSE between the Q-network and the Q-learning targets:
           L(w) = E_{s,a,r,s' ~ D} [(r + \gamma \max_{a'} Q(s',a';w-) - Q(s,a;w))^2]
        6. use a variant of stochastic gradient descent
        """
        # Note: the following 2 networks need to have the same structure
        # fixed, old-parameters Q-network for Q-target estimation
        with tf.variable_scope("target_q"):
            q_target = self.network(self.input_next_state, trainable=False,
                                    use_image=use_image)
        # trainable, new-parameters Q-network for Q-learning
        with tf.variable_scope("update_q"):
            q_estimate = self.network(self.input_prev_state, trainable=True,
                                      use_image=use_image)
        # optimal action recovered by the newest Q-network
        self.optimal_action = tf.argmax(q_estimate, axis=1)
        not_done = tf.cast(tf.logical_not(tf.cast(self.is_done, "bool")), tf.float32)
        q_target_value = self.input_reward + not_done * self.mdp_gamma * tf.reduce_max(q_target, -1)
        # select the values for the chosen self.input_actions from q_estimate
        # first get the indexes
        idx = tf.stack([tf.range(0, self.batch_size), self.input_actions], axis=1)
        q_estimate_value = tf.gather_nd(q_estimate, idx)
        # MSE loss
        mse_loss = tf.nn.l2_loss(q_estimate_value - q_target_value) / self.batch_size
        # Regularization loss
        regularization_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.loss = mse_loss + regularization_loss
        self.train_op = self.optimizer.minimize(self.loss)

        update_params = self.get_variables("update_q")
        target_params = self.get_variables("target_q")
        assert len(update_params) == len(target_params)
        # weights copy op
        if if_soft:
            self.assign_op = [tf.assign(tp, 0.001 * up + 0.999 * tp)
                              for tp, up in zip(target_params, update_params)]
        else:
            self.assign_op = [tf.assign(tp, up)
                              for tp, up in zip(target_params, update_params)]

    def train(self):
        """ train step """
        # sample one mini-batch to compute the mse
        batch = self.exp_buffer.sample_mini_batch(self.batch_size)
        if self.num_frame_stack > 1:
            # assumes image observations are used
            feed_dict = {
                self.input_prev_state: batch["prev_state"],
                self.input_next_state: batch["next_state"],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }
        else:
            # reduce axis 1
            feed_dict = {
                self.input_prev_state: batch["prev_state"][:, 0, :],
                self.input_next_state: batch["next_state"][:, 0, :],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }
        _, loss = self.session.run([self.train_op, self.loss], feed_dict=feed_dict)
        self.loss_history.append(loss)
        return loss

    def update_target_network(self):
        """ Update target network """
        # no need for feed dicts
        self.session.run(self.assign_op)

    def play_episode(self):
        if self.is_training:
            rb = self.exp_buffer
        else:
            rb = self.play_buffer
        # total reward
        sum_reward = 0
        # total loss
        sum_loss = 0
        # steps
        steps_in_episode = 0
        first_obs = self.env.reset()
        rb.new_episode(first_obs)
        while True:
            if np.random.rand() > self.get_epsilon():
                if self.num_frame_stack > 1:
                    action = self.session.run(
                        self.optimal_action,
                        {self.input_prev_state: rb.current_state()[np.newaxis, :]})[0]
                else:
                    action = self.session.run(
                        self.optimal_action,
                        {self.input_prev_state: rb.current_state()})[0]
            else:
                action = self.sample_random_action()
            obs, reward, done, info = self.env.step(action)
            if self.render:
                self.env.render()
            sum_reward += reward
            steps_in_episode += 1
            # add one experience into the buffer
            rb.add_experience(obs, action, done, reward)
            if self.is_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                if (self.exp_buffer.counter >= self.min_replay_size
                        and self.global_counter % self.train_freq == 0):
                    sum_loss += self.train()
            if done:
                if self.is_training:
                    self.episode_counter += 1
                return sum_reward, steps_in_episode, sum_loss / float(steps_in_episode)
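# A standalone numeric sketch of the soft ("Polyak") target update built in
# setup_graph above: each target weight moves 0.1% of the way toward the
# corresponding online weight per update (toy numbers, not real weights):
import numpy as np

target_w = np.array([1.0, -2.0])
update_w = np.array([0.0, 0.0])
tau = 0.001
target_w = tau * update_w + (1 - tau) * target_w
# -> [0.999, -1.998]: the target network trails the online network slowly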
class DQN: """ Implementation of deep q learning algorithm """ def __init__(self): self.prob_random = 1.0 # Probability to play random action self.y = .99 # Discount factor self.batch_size = 64 # How many experiences to use for each training step self.prob_random_end = .01 # Ending chance of random action self.prob_random_decay = .996 # Decrease decay of the prob random self.max_episode = 300 # Max number of episodes you are allowes to played to train the game self.expected_goal = 200 # Expected goal self.dnn = DNN() self.env = gym.make('CartPole-v0') self.memory = ExperienceReplay(buffer_size=10000) self.metadata = [ ] # we will store here info score, at the end of each episode def choose_action(self, state, prob_random): if np.random.rand() <= prob_random: action = np.random.randint(self.env.action_space.n) else: action = np.argmax(self.dnn.model.predict(state)) return action def run_one_step(self, state): action = self.choose_action(state, self.prob_random) next_state, reward, done, _ = self.env.step(action) next_state = np.expand_dims(next_state, axis=0) return state, action, reward, next_state, done def generate_target_q(self, train_state, train_action, train_reward, train_next_state, train_done): # Our predictions (actions to take) from the main Q network target_q = self.dnn.model.predict(train_state) # Tells us whether game over or not # We will multiply our rewards by this value # to ensure we don't train on the last move train_gameover = train_done == 0 # Q value of the next state based on action target_q_next_state = self.dnn.model.predict(train_next_state) train_next_state_values = np.max(target_q_next_state[range( self.batch_size)], axis=1) # Reward from the action chosen in the train batch actual_reward = train_reward + (self.y * train_next_state_values * train_gameover) target_q[range(self.batch_size), train_action] = actual_reward return target_q def train_one_step(self): batch_data = self.memory.sample(self.batch_size) train_state = np.array([i[0] for i in batch_data]) train_action = np.array([i[1] for i in batch_data]) train_reward = np.array([i[2] for i in batch_data]) train_next_state = np.array([i[3] for i in batch_data]) train_done = np.array([i[4] for i in batch_data]) # These lines remove useless dimension of the matrix train_state = np.squeeze(train_state) train_next_state = np.squeeze(train_next_state) # Generate target Q target_q = self.generate_target_q(train_state=train_state, train_action=train_action, train_reward=train_reward, train_next_state=train_next_state, train_done=train_done) loss = self.dnn.model.train_on_batch(train_state, target_q) return loss def train(self): scores = [] for e in range(self.max_episode): # Init New episode state = self.env.reset() state = np.expand_dims(state, axis=0) episode_score = 0 while True: state, action, reward, next_state, done = self.run_one_step( state) self.memory.add( experiences=[[state, action, reward, next_state, done]]) episode_score += reward state = next_state if len(self.memory.buffer) > self.batch_size: self.train_one_step() if self.prob_random > self.prob_random_end: self.prob_random *= self.prob_random_decay if done: now = datetime.now() dt_string = now.strftime("%d/%m/%Y %H:%M:%S") self.metadata.append( [now, e, episode_score, self.prob_random]) print( "{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}" .format(dt_string, e, self.max_episode, episode_score, self.prob_random)) break scores.append(episode_score) # Average score of last 100 episode means_last_10_scores = np.mean(scores[-10:]) if 
            if means_last_10_scores >= self.expected_goal:
                print('\n Task Completed! \n')
                break
            print("Average over last 10 episodes: {0:.2f} \n".format(means_last_10_scores))
        print("Maximum number of episodes played: %d" % self.max_episode)
from training_testing import test

# parameters
epsilon = 0.1      # exploration
max_memory = 500   # Maximum number of experiences we are storing
hidden_size = 100  # Size of the hidden layers
batch_size = 1     # Number of experiences we use for training per batch
epoch = 50


def baseline_model(grid_size, num_actions, hidden_size):
    # setting up the model with keras
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(grid_size**2,), activation='relu'))
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dense(num_actions))
    model.compile(SGD(lr=.1), "mse")
    return model


# Define environment/game
env = Catch()

# Initialize experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory)

model = baseline_model(grid_size, num_actions, hidden_size)
train(env, model, exp_replay, epoch, epsilon, num_actions, batch_size)
test(model)
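# Note on the input_shape above: the Catch grid is flattened into a vector
# before it reaches the network, so for a hypothetical grid_size of 10 (an
# assumption, not stated in this snippet) the input is a 100-dimensional vector.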
    downsampling_pix_values=None,
    atari_frameskip=args.atari_frameskip)

eval_fn = get_evaluate_fn(env_eval=env_eval,
                          preproc_obs_fn=preproc_obs_fn,
                          policy_NN=call_model,
                          args=args)

process = psutil.Process()
memory_usage_fn = lambda: process.memory_info().rss
stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)

experience_keys = ["observations", "target_policy"]
if args.compute_value:
    experience_keys.append("returns")
experience_replay = ExperienceReplay(keys=experience_keys,
                                     capacity=args.replay_capacity)

run_episode_fn = get_episode_fn(
    actor=high_level_actor if args.hierarchical else low_level_actor,
    planner=high_level_planner if args.hierarchical else low_level_planner,
    train_fn=train_fn,
    dataset=experience_replay,
    add_returns=args.compute_value,
    stats=stats,
    memory_usage_fn=memory_usage_fn,
    preproc_obs_fn=preproc_obs_fn,
    eval_fn=eval_fn,
    n_actions=env.action_space.n,
    value_scalars_to_distrs=value_scalars_to_distrs,
    value_logits_to_scalars=value_logits_to_scalars,
    args=args)
gameDisplay = pygame.display.set_mode(DISPLAY_SHAPE)
pygame.display.set_caption('Bouncing Balls')
pygame.key.set_repeat(1, 1)

env = GameEnvironment(DISPLAY_SHAPE, 1.0 / float(FPS))


def action_vector(a):
    res = np.zeros(9)
    res[int(a)] = 1.0
    return res


# Define Experience Replay
if SAVE_EXPERIENCE:
    er = ExperienceReplay.load(EXP_REPLAY_FILE)
    if er is None:
        er = ExperienceReplay(BUFFER_SIZE)


def gameover(hero_score):
    gameDisplay.fill(WHITE)
    font = pygame.font.SysFont(None, 42)
    text = font.render("GAME OVER", True, BLACK)
    gameDisplay.blit(text, (DISPLAY_SHAPE[0] / 3, DISPLAY_SHAPE[1] / 3))
    pygame.display.update()
    pygame.time.delay(3000)
from experience_replay import ExperienceReplay
from logger import Logger

ACTIONS = {0: "UP", 1: "DOWN", 2: "RIGHT", 3: "LEFT"}
NUM_ACTIONS = len(ACTIONS)
NUM_GAMES = 30000
OBSERVE = 1000
MAX_TILE = 2048

epsilon = 0.1
min_epsilon = 1e-2
gamma_epsilon = 0.999
gamma_reward = 0.99

replay = ExperienceReplay(capacity=1e6)
logger = Logger()
online = PolicyNetwork(batch_size=32)
target = PolicyNetwork(batch_size=32)


def preprocess(a: np.array) -> np.array:
    a = np.where(a <= 0, 1, a)
    a = np.log2(a) / np.log2(MAX_TILE)
    return a


if __name__ == "__main__":
    best_score = 0
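# Worked example of the preprocess normalization above: empty cells (<= 0) are
# mapped to 1 so that log2 sends them to 0, and every tile value is scaled by
# log2(MAX_TILE) = 11 into [0, 1]:
#
#   preprocess(np.array([0, 2, 4, 2048]))
#   -> [0.0, 1/11, 2/11, 1.0]  i.e. [0.0, 0.0909..., 0.1818..., 1.0]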
from __future__ import division, print_function

import gym
import gym_gazebo
import numpy as np
import sys
import os

from ddq_model import Qnet
from experience_replay import ExperienceReplay
from utils import Config

argv = sys.argv[1:]
config = Config(argv)

env = gym.make('GazeboTurtlebotMazeColor-v0')
replay = ExperienceReplay(config.args.output_dir, config.args.replay_buffer_size)
qnet = Qnet(env.num_state, env.num_action)

if config.args.continue_from is not None:
    qnet.load(config.args.continue_from)
    replay.load(config.args.continue_from)
elif config.args.from_pretrain is not None:
    qnet.load(config.args.from_pretrain)

epsilon = config.args.start_epsilon
epsilon_decay = (config.args.start_epsilon - config.args.end_epsilon) / config.args.annealing_steps

while True:
    state = env.reset()
np.random.seed(seed)
tf.random.set_seed(seed)

# Instead of env.step() and env.reset(), we'll use the TreeActor helper class,
# which creates a tree and adds nodes to it
env = gym.make(env_id)
observe_fn = observe_pi_iw_dynamic if use_dynamic_feats else observe_pi_iw_BASIC
actor = TreeActor(env, observe_fn=observe_fn)

planner = RolloutIW(branching_factor=env.action_space.n, ignore_cached_nodes=True)

model = Mnih2013(num_logits=env.action_space.n, add_value=False)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate,
                                        rho=rmsprop_decay,
                                        epsilon=rmsprop_epsilon)
learner = SupervisedPolicy(model, optimizer,
                           regularization_factor=regularization_factor,
                           use_graph=True)

experience_replay = ExperienceReplay(capacity=replay_capacity)


def network_policy(node, branching_factor):
    return node.data["probs"]


# Initialize experience replay: run some steps until we have enough examples to form one batch
print("Initializing experience replay", flush=True)
actor.reset()
while len(experience_replay) < batch_size:
    r, episode_done = planning_step(actor=actor,
                                    planner=planner,
                                    dataset=experience_replay,
                                    policy_fn=network_policy,
                                    tree_budget=tree_budget,
                                    cache_subtree=cache_subtree,
                                    discount_factor=discount_factor)
class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceReplay(num_frame_stack,
                                            capacity=experience_capacity,
                                            pic_size=pic_size)
        self.playing_cache = ExperienceReplay(num_frame_stack,
                                              capacity=num_frame_stack * 5 + 10,
                                              pic_size=pic_size)
        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0
        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n
        self.q_values = []
        self.loss_his = []

    @staticmethod
    def process_image(img):
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask")

        with tf.variable_scope("target"):
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        self.q_value_mean = tf.reduce_mean(q_target)

        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.loss = tf.reduce_mean(reg_loss + training_loss)
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        target_params = self.get_variables("target")
        try:
            self.copy_network_ops = [
                tf.assign(target_v, train_v)
                for train_v, target_v in zip(train_params, target_params)
            ]
        except:
            print("error")

    def get_variables(self, scope):
        vars = [
            t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name
        ]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None
        input_t = tf.transpose(input, [0, 2, 3, 1])
        net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC",
                          activation_fn=tf.nn.relu, stride=3,
                          weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net, 16, (3, 3), data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        fc_1 = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
                                    weights_regularizer=wr, trainable=trainable)
        fc_2 = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
                                    weights_regularizer=wr, trainable=trainable)
        value = slim.fully_connected(fc_1, 1, activation_fn=None,
                                     weights_regularizer=wr, trainable=trainable)
        advantage = slim.fully_connected(fc_2, self.dim_actions, activation_fn=None,
                                         weights_regularizer=wr, trainable=trainable)
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        q_state_action_values = value + (advantage - tf.reduce_mean(
            advantage, reduction_indices=[1], keepdims=True))
        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        _, action_value, loss = self.session.run(
            [self.train_op, self.q_value_mean, self.loss], fd1)
        self.q_values.append(action_value)
        self.loss_his.append(loss)

    def play_episode(self):
        eh = (self.exp_history if self.do_training else self.playing_cache)
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.process_image(first_frame)
        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]})[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward)
            if early_done:
                reward += punishment
            done = done or early_done

            total_reward += reward
            frames_in_episode += 1
            eh.add_experience(self.process_image(observation), action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                # update the target network every network_update_freq steps
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (self.exp_history.counter >= self.min_experience_size
                              and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1
                q_value = np.mean(self.q_values)
                loss = np.mean(self.loss_his)
                self.q_values = []
                self.loss_his = []
                return total_reward, frames_in_episode, q_value, loss

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
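# A standalone numeric sketch of the dueling aggregation in create_network above
# (toy numbers): subtracting the mean advantage makes the value/advantage
# decomposition identifiable, so V(s) carries the state value on its own.
import numpy as np

value = np.array([[1.0]])                 # V(s), shape (batch, 1)
advantage = np.array([[2.0, 0.0, -2.0]])  # A(s, a), shape (batch, n_actions)
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
# -> [[3.0, 1.0, -1.0]]; the mean advantage is 0 here, so Q averages to V(s) = 1.0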
plan_step_fn = get_pi_iw_planning_step_fn(actor=actor,
                                          planner=planner,
                                          policy_fn=network_policy,
                                          tree_budget=tree_budget,
                                          discount_factor=discount_factor,
                                          temp=policy_temp)
learner = SupervisedPolicy(model, optimizer,
                           regularization_factor=regularization_factor,
                           use_graph=True)

# Initialize experience replay: run complete episodes until we exceed both
# batch_size and replay_min_transitions
print("Initializing experience replay", flush=True)
train_stats = TrainStats()
experience_replay = ExperienceReplay(capacity=replay_capacity)
while (len(experience_replay) < batch_size
       or len(experience_replay) < replay_min_transitions):
    episode_rewards = run_episode(plan_step_fn=plan_step_fn,
                                  learner=None,
                                  dataset=experience_replay,
                                  cache_subtree=cache_subtree,
                                  add_returns=(args.algorithm == "AlphaZero"),
                                  preproc_obs_fn=preproc_obs_fn,
                                  render=args.render)
    train_stats.report(episode_rewards, actor.nodes_generated)

# Interleave planning and learning steps
print("\nInterleaving planning and learning steps.", flush=True)
while actor.nodes_generated < max_simulator_steps: