def train(self):
    t = trange(self.max_episode)
    for episode_count in t:
        state = self.env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            if self.render_environment:
                self.env.render()
            # Actor selects action, observe reward and state, then store them
            action = self.select_action(state, self.actor)
            next_state, reward, done, info = self.env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Batch train once enough samples in memory
            if len(self.memory) >= self.batch_size:
                self.memory = list(zip(*self.memory))
                states, actions, rewards, next_states, dones = self.memory
                # Calculate advantage
                batch_values = self.critic.predict_on_batch(
                    np.array(states)).ravel()
                batch_next_values = self.critic.predict_on_batch(
                    np.array(next_states)).ravel()
                td_targets = rewards + self.gamma * (
                    1 - np.array(dones)) * batch_next_values
                td_errors = td_targets - batch_values
                advantages = np.zeros((self.batch_size, self.num_actions))
                for i in range(self.batch_size):
                    advantages[i][actions[i]] = td_errors[i]
                # Train critic
                self.critic.train_on_batch(np.array(states),
                                           np.array(td_targets))
                # Train actor
                self.actor.train_on_batch(np.array(states),
                                          np.array(advantages))
                self.memory = []
            # For logging and visualizing data
            if done or current_step > self.max_step:
                self.logger.log_history(episode_reward, episode_count)
                self.logger.show_progress(t, episode_reward, episode_count)
                if episode_count % self.logger.slide_window == 0:
                    visualize(self.logger.rewards,
                              self.logger.running_rewards,
                              self.logger.episode_counts,
                              os.path.join(self.result_path, "A2C.png"))
                break
            state = next_state
            current_step += 1
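
# select_action() is called above but not defined in this snippet. A minimal
# sketch, assuming the actor ends in a softmax over self.num_actions and that
# the action is sampled from that distribution; only the call signature
# select_action(state, actor) is taken from the code above, the body is an
# assumption.
import numpy as np

def select_action(self, state, actor):
    # Predict action probabilities for a single state and sample an action
    policy = actor.predict(np.reshape(state, (1, self.num_states))).ravel()
    # Re-normalize to guard against float32 rounding before sampling
    policy = policy / np.sum(policy)
    return np.random.choice(self.num_actions, p=policy)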
def train(self):
    t = trange(self.max_episode)
    for episode_count in t:
        state = self.env.reset()
        current_step = 0
        while True:
            if self.render_environment:
                self.env.render()
            # Select action, observe reward and state, then store them
            action = self.select_action(state)
            next_state, reward, done, info = self.env.step(action)
            self.store_memory(state, action, reward)
            # Start training when an episode finishes
            if done or current_step > self.max_step:
                episode_reward = np.sum(self.rewards)
                # Train policy network every episode
                episode_length = len(self.states)
                discounted_rewards = self.discount_rewards(self.rewards)
                advantages = np.zeros((episode_length, self.num_actions))
                for i in range(episode_length):
                    advantages[i][self.actions[i]] = discounted_rewards[i]
                self.agent.train_on_batch(np.array(self.states), advantages)
                # For logging and visualizing data
                self.logger.log_history(episode_reward, episode_count)
                self.logger.show_progress(t, episode_reward, episode_count)
                if episode_count % self.logger.slide_window == 0:
                    visualize(self.logger.rewards,
                              self.logger.running_rewards,
                              self.logger.episode_counts,
                              os.path.join(self.result_path, "REINFORCE.png"))
                # Clear memory after episode finishes
                self.states, self.actions, self.rewards = [], [], []
                break
            state = next_state
            current_step += 1
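
# discount_rewards() is referenced above but not defined in this snippet. A
# minimal sketch of the usual discounted-return computation for REINFORCE; the
# standardization step is a common variance-reduction trick and is an
# assumption, not taken from the original code.
import numpy as np

def discount_rewards(self, rewards):
    # Compute the discounted return G_t for every step of the finished episode
    discounted = np.zeros(len(rewards))
    running_sum = 0.0
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + self.gamma * running_sum
        discounted[t] = running_sum
    # Standardize returns to reduce gradient variance (assumption)
    discounted -= np.mean(discounted)
    discounted /= (np.std(discounted) + 1e-8)
    return discounted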
def train(self):
    # Setup environment first
    env = gym.make('CartPole-v1')
    env.seed(1)
    env = env.unwrapped
    self.num_actions = env.action_space.n
    self.num_states = env.observation_space.shape[0]

    actor = self.build_actor()
    critic = self.build_critic()

    max_episode = 100000
    max_step = 10000
    slide_window = 50

    # Populate memory first
    state = env.reset()
    print("Warming up...")
    for i in range(self.batch_size):
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        self.store_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()
    print("Warm up complete.")

    for episode_count in range(max_episode):
        state = env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            # env.render()
            # Actor selects action, observe reward and state, then store them
            action = self.select_action(state, actor)
            next_state, reward, done, info = env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)
            # Transform the minibatch for processing
            minibatch = list(zip(*minibatch))
            # Calculate advantage
            states, actions, rewards, next_states, dones = minibatch
            batch_values = critic.predict_on_batch(np.array(states)).ravel()
            batch_next_values = critic.predict_on_batch(
                np.array(next_states)).ravel()
            td_targets = rewards + self.gamma * (
                1 - np.array(dones)) * batch_next_values
            td_errors = td_targets - batch_values
            advantages = np.zeros((self.batch_size, self.num_actions))
            for i in range(self.batch_size):
                advantages[i][actions[i]] = td_errors[i]
            # Train critic
            critic.train_on_batch(np.array(states), np.array(td_targets))
            # Train actor
            actor.train_on_batch(np.array(states), np.array(advantages))
            # For logging data
            if done or current_step > max_step:
                visualize(episode_reward, episode_count, slide_window,
                          "A2C_Offpolicy.png")
                break
            state = next_state
            current_step += 1
def train(self):
    # Setup environment first
    env = gym.make('CartPole-v1')
    env.seed(1)
    env = env.unwrapped
    self.num_actions = env.action_space.n
    self.num_states = env.observation_space.shape[0]

    # Dummy inputs used during prediction
    self.dummy_input = np.zeros((1, self.num_actions))
    self.dummy_batch_input = np.zeros((self.batch_size, self.num_actions))

    actor = self.build_actor()
    old_actor = self.build_actor()
    old_actor.set_weights(actor.get_weights())
    critic = self.build_critic()

    max_episode = 100000
    max_step = 10000
    slide_window = 100

    for episode_count in range(max_episode):
        state = env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            # env.render()
            # Actor selects action, observe reward and state
            action = self.select_action(state, actor)
            next_state, reward, done, info = env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Batch train once enough samples in memory
            if len(self.memory) >= self.batch_size:
                self.memory = list(zip(*self.memory))
                states, actions, rewards, next_states, dones = self.memory
                # Calculate advantage
                batch_values = critic.predict_on_batch(
                    np.array(states)).ravel()
                batch_next_values = critic.predict_on_batch(
                    np.array(next_states)).ravel()
                old_policies = old_actor.predict_on_batch(
                    [np.array(states), self.dummy_batch_input,
                     self.dummy_batch_input])
                td_targets = rewards + self.gamma * (
                    1 - np.array(dones)) * batch_next_values
                td_errors = td_targets - batch_values
                advantages = np.zeros((self.batch_size, self.num_actions))
                one_hot_actions = np.zeros((self.batch_size, self.num_actions))
                for i in range(self.batch_size):
                    advantages[i][actions[i]] = td_errors[i]
                    one_hot_actions[i][actions[i]] = 1
                # Train critic
                critic.train_on_batch(np.array(states), np.array(td_targets))
                # Train actor
                actor.train_on_batch(
                    [np.array(states), advantages, old_policies],
                    one_hot_actions)
                # Update old_actor weights to use for next step
                old_actor.set_weights(
                    self.alpha * np.array(actor.get_weights()) +
                    (1 - self.alpha) * np.array(old_actor.get_weights()))
                self.memory = []
            # For logging data
            if done or current_step > max_step:
                visualize(episode_reward, episode_count, slide_window,
                          "PPO.png")
                break
            state = next_state
            current_step += 1
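
# build_actor() for the PPO loop above is not shown. A hedged sketch of one
# common Keras pattern that matches the three-input call
# actor.train_on_batch([states, advantages, old_policies], one_hot_actions):
# the advantage and old-policy tensors are extra inputs consumed only by a
# custom clipped-surrogate loss. Layer sizes, the learning rate, and the
# clipping range epsilon are assumptions. Written for standalone Keras with a
# TF1 graph backend, matching the K.get_session()/K.gradients usage elsewhere
# in this code; it is a sketch, not the repository's actual implementation.
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

def build_actor(self):
    state_input = Input(shape=(self.num_states,))
    advantage = Input(shape=(self.num_actions,))
    old_policy = Input(shape=(self.num_actions,))

    hidden = Dense(24, activation='relu')(state_input)
    policy = Dense(self.num_actions, activation='softmax')(hidden)

    def ppo_loss(y_true, y_pred):
        # y_true is the one-hot encoding of the action actually taken
        epsilon = 0.2  # clipping range (assumption)
        prob = K.sum(y_true * y_pred, axis=-1)
        old_prob = K.sum(y_true * old_policy, axis=-1)
        ratio = prob / (old_prob + 1e-10)
        adv = K.sum(y_true * advantage, axis=-1)
        clipped = K.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
        # Maximize the clipped surrogate objective
        return -K.mean(K.minimum(ratio * adv, clipped))

    model = Model(inputs=[state_input, advantage, old_policy], outputs=policy)
    model.compile(optimizer=Adam(lr=1e-3), loss=ppo_loss)
    return model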
def train(self):
    # Setup environment first
    env = gym.make('CartPole-v1')
    env.seed(1)
    env = env.unwrapped
    self.num_actions = env.action_space.n
    self.num_states = env.observation_space.shape[0]

    # Initialize q_network and target_q_network
    q_network = self.build_network()
    target_q_network = self.build_network()
    target_q_network.set_weights(q_network.get_weights())

    max_episode = 100000
    max_step = 10000
    slide_window = 100

    # Populate memory first
    state = env.reset()
    print("Warming up...")
    for i in range(self.batch_size):
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        # Priority=1 for these transitions
        self.store_memory(1, state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()
    print("Warm up complete.")

    for episode_count in range(max_episode):
        state = env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            # env.render()
            # Network predict
            q_values = q_network.predict(
                np.reshape(state, (1, self.num_states))).ravel()
            action = np.argmax(q_values)
            # Perform action
            next_state, reward, done, info = env.step(action)
            # Calculate priority
            next_q_values = target_q_network.predict(
                np.reshape(next_state, (1, self.num_states))).ravel()
            next_action = np.argmax(
                q_network.predict(
                    np.reshape(next_state, (1, self.num_states))).ravel())
            # Note that the td_error is not squared like in DQN
            td_error = reward + self.gamma * (
                1 - done) * next_q_values[next_action] - q_values[action]
            priority = (abs(td_error) + self.constant)**self.alpha
            # Store transition
            episode_reward += reward
            self.store_memory(priority, state, action, reward, next_state,
                              done)
            if current_step % self.replay_frequency == 0:
                # Sample minibatch from memory based on their priority
                minibatch = []
                ISWeights = []
                min_prob = np.min(self.memory.tree[-self.memory.capacity:] /
                                  self.memory.total())
                T = self.memory.total() // self.batch_size
                for i in range(self.batch_size):
                    a, b = T * i, T * (i + 1)
                    s = random.uniform(a, b)
                    idx, priority, data = self.memory.get(s)
                    probability = priority / self.memory.total()
                    ISWeights.append(
                        np.power(probability / min_prob, -self.beta))
                    minibatch.append((*data, idx))
                self.beta = np.min(
                    [self.beta_max, self.beta + self.beta_increase])
                # Transform the minibatch for processing
                minibatch = list(zip(*minibatch))
                # Calculate all td_targets for current minibatch
                states, actions, rewards, next_states, dones, indices = minibatch
                batch_q_values = q_network.predict_on_batch(np.array(states))
                batch_next_q_values = target_q_network.predict_on_batch(
                    np.array(next_states))
                next_actions = np.argmax(
                    q_network.predict_on_batch(np.array(next_states)), axis=1)
                td_targets = batch_q_values.copy()
                for i in range(self.batch_size):
                    td_targets[i][actions[i]] = rewards[i] + self.gamma * (
                        1 - dones[i]) * batch_next_q_values[i][next_actions[i]]
                    # Need to recalculate priorities for transitions in minibatch
                    priority = (abs(td_targets[i][actions[i]] -
                                    batch_q_values[i][actions[i]]) +
                                self.constant)**self.alpha
                    self.memory.update(indices[i], priority)
                # Train network
                q_network.train_on_batch(np.array(states),
                                         np.array(td_targets),
                                         np.array(ISWeights))
            # Hard copy q_network to target_q_network
            if done or current_step % self.update_frequency == 0:
                target_q_network.set_weights(q_network.get_weights())
            # For logging data
            if done or current_step > max_step:
                visualize(episode_reward, episode_count, slide_window,
                          "Noisy_DDDQN_PER_Prop.png")
                break
            state = next_state
            current_step += 1
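
# The proportional-PER loops above assume self.memory is a sum tree exposing
# total(), get(s), update(idx, priority), add(priority, data), plus .tree and
# .capacity. A minimal sketch of such a structure; the class name and internal
# details are assumptions, not the repository's actual implementation.
import numpy as np

class SumTree:
    def __init__(self, capacity):
        self.capacity = capacity                # number of leaf nodes
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes + leaves
        self.data = [None] * capacity           # stored transitions
        self.write = 0                          # next leaf slot to overwrite

    def total(self):
        # Root node holds the sum of all leaf priorities
        return self.tree[0]

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        # Propagate the change up to the root
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        # Walk down the tree until a leaf is reached
        idx = 0
        while True:
            left, right = 2 * idx + 1, 2 * idx + 2
            if left >= len(self.tree):
                break
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = right
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]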
def train(self):
    # Initialize q_network and target_q_network
    q_network = self.build_agent()
    target_q_network = self.build_agent()
    target_q_network.set_weights(q_network.get_weights())

    # Populate memory first
    state = self.env.reset()
    print("Warming up...")
    while len(self.memory) < self.batch_size:
        action = self.env.action_space.sample()
        next_state, reward, done, info = self.env.step(action)
        self.store_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = self.env.reset()
    print("Warm up complete.")

    t = trange(self.max_episode)
    for episode_count in t:
        state = self.env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            if self.render_environment:
                self.env.render()
            # Network predict
            q_values = q_network.predict(
                np.reshape(state, (1, self.num_states))).ravel()
            # Decide if exploring or not
            if np.random.rand() >= self.epsilon:
                action = np.argmax(q_values)
            else:
                action = random.randrange(self.num_actions)
            # Perform action
            next_state, reward, done, info = self.env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Decrease exploration
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            # Sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)
            # Transform the minibatch for processing
            minibatch = list(zip(*minibatch))
            # Calculate all td_targets for current minibatch
            states, actions, rewards, next_states, dones = minibatch
            batch_q_values = q_network.predict_on_batch(np.array(states))
            batch_next_q_values = target_q_network.predict_on_batch(
                np.array(next_states))
            max_next_q_values = np.max(batch_next_q_values, axis=1)
            td_targets = batch_q_values.copy()
            for i in range(self.batch_size):
                td_targets[i][actions[i]] = rewards[i] + self.gamma * (
                    1 - dones[i]) * max_next_q_values[i]
            # Train network
            q_network.train_on_batch(np.array(states), np.array(td_targets))
            # Copy q_network to target_q_network
            if done or current_step % self.update_frequency == 0:
                target_q_network.set_weights(q_network.get_weights())
            # For logging and visualizing data
            if done or current_step > self.max_step:
                self.logger.log_history(episode_reward, episode_count)
                self.logger.show_progress(t, episode_reward, episode_count)
                if episode_count % self.logger.slide_window == 0:
                    visualize(self.logger.rewards,
                              self.logger.running_rewards,
                              self.logger.episode_counts,
                              os.path.join(self.result_path, "DQN.png"))
                break
            state = next_state
            current_step += 1
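
# store_memory() in the DQN loop above is not shown; random.sample(self.memory, ...)
# implies a flat replay buffer. A minimal sketch using a bounded FIFO deque; the
# maxlen value is an assumption.
from collections import deque

def store_memory(self, state, action, reward, next_state, done):
    # self.memory is assumed to be created in __init__,
    # e.g. self.memory = deque(maxlen=10000)
    self.memory.append((state, action, reward, next_state, done))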
def train(self):
    # Setup environment first
    env = gym.make('MountainCarContinuous-v0')
    env.seed(1)
    env = env.unwrapped
    self.num_actions = env.action_space.shape[0]
    self.num_states = env.observation_space.shape[0]

    # Dummy inputs used during prediction
    self.dummy_input = np.zeros((1, self.num_actions))
    self.dummy_batch_input = np.zeros((self.batch_size, self.num_actions))

    actor = self.build_actor()
    critic = self.build_critic()

    max_episode = 100000
    max_step = 10000
    slide_window = 20

    for episode_count in range(max_episode):
        state = env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            # env.render()
            # Actor selects action, observe reward and state, then store them
            action = self.select_action(state, actor)
            action = np.clip(action, env.action_space.low[0],
                             env.action_space.high[0])
            next_state, reward, done, info = env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Batch train once enough samples in memory
            if len(self.memory) >= self.batch_size:
                self.memory = list(zip(*self.memory))
                states, actions, rewards, next_states, dones = self.memory
                # Calculate advantage
                batch_values = critic.predict_on_batch(
                    np.array(states)).ravel()
                batch_next_values = critic.predict_on_batch(
                    np.array(next_states)).ravel()
                td_targets = rewards + self.gamma * (
                    1 - np.array(dones)) * batch_next_values
                td_errors = td_targets - batch_values
                advantages = np.zeros((self.batch_size, self.num_actions))
                batch_actions = np.zeros((self.batch_size, self.num_actions))
                for i in range(self.batch_size):
                    for j in range(self.num_actions):
                        advantages[i][j] = td_errors[i]
                        batch_actions[i][j] = actions[i]
                # Train critic
                critic.train_on_batch(np.array(states), np.array(td_targets))
                # Train actor
                actor.train_on_batch(
                    [np.array(states), advantages, batch_actions],
                    [self.dummy_batch_input, self.dummy_batch_input])
                self.memory = []
            # For logging data
            if done or current_step > max_step:
                visualize(episode_reward, episode_count, slide_window,
                          "A2C_Continuous.png")
                break
            state = next_state
            current_step += 1
def train(self):
    # Setup environment first
    env = gym.make('MountainCarContinuous-v0')
    env.seed(1)
    env = env.unwrapped
    self.num_actions = env.action_space.shape[0]
    self.num_states = env.observation_space.shape[0]
    self.noise = OrnsteinUhlenbeckProcess(size=self.num_actions)

    actor = self.build_actor()
    target_actor = self.build_actor()
    target_actor.set_weights(actor.get_weights())
    critic = self.build_critic()
    target_critic = self.build_critic()
    target_critic.set_weights(critic.get_weights())
    q_gradients_connect = K.gradients(critic.output, critic.input[1])

    max_episode = 100000
    max_step = 1000
    slide_window = 50

    # Populate memory first
    state = env.reset()
    print("Warming up...")
    while len(self.memory) < self.batch_size:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        self.store_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()
    print("Warm up complete.")

    sess = K.get_session()
    for episode_count in range(max_episode):
        state = env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            # env.render()
            # Actor selects action, observe reward and state, then store them
            action = self.select_action(state, actor, current_step,
                                        env.action_space.low[0],
                                        env.action_space.high[0])
            next_state, reward, done, info = env.step(action)
            # Store transition
            episode_reward += reward
            self.store_memory(state, action, reward, next_state, done)
            # Sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)
            # Transform the minibatch for processing
            minibatch = list(zip(*minibatch))
            # Get td target
            states, actions, rewards, next_states, dones = minibatch
            next_actions = [self.select_action(next_state, target_actor,
                                               current_step,
                                               env.action_space.low[0],
                                               env.action_space.high[0])
                            for next_state in next_states]
            # Action from policy is considered argmax of Q-value network
            batch_next_q_values = target_critic.predict_on_batch(
                [np.array(next_states), np.array(next_actions)]).ravel()
            td_targets = rewards + self.gamma * (
                1 - np.array(dones)) * batch_next_q_values
            # Get gradient of critic output wrt to the action input
            # tmp_actions = actor.predict_on_batch(np.array(states))
            # tmp_actions = np.clip(tmp_actions, env.action_space.low[0], env.action_space.high[0])
            # actions_for_gradients = critic.predict_on_batch([np.array(states), tmp_actions])
            actions_for_gradients = critic.predict_on_batch(
                [np.array(states), np.array(actions)])
            q_gradients = sess.run(q_gradients_connect,
                                   feed_dict={
                                       critic.input[0]: np.array(states),
                                       critic.input[1]: actions_for_gradients
                                   })
            # Train critic
            critic.train_on_batch([np.array(states), np.array(actions)],
                                  np.array(td_targets))
            # Train actor
            actor.train_on_batch(np.array(states), q_gradients)
            # Soft update target actor and critic
            target_actor.set_weights(
                self.alpha * np.array(actor.get_weights()) +
                (1 - self.alpha) * np.array(target_actor.get_weights()))
            target_critic.set_weights(
                self.alpha * np.array(critic.get_weights()) +
                (1 - self.alpha) * np.array(target_critic.get_weights()))
            # For logging data
            if done or current_step > max_step:
                visualize(episode_reward, episode_count, slide_window,
                          "DDPG.png")
                break
            state = next_state
            current_step += 1
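
# OrnsteinUhlenbeckProcess is instantiated above but not defined in this
# snippet. A minimal sketch of temporally correlated exploration noise for
# DDPG; the parameter defaults (theta, mu, sigma, dt) and the sample() method
# name are assumptions.
import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # Restart the process from its mean
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = x
        return x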
# Map runtime settings
cfg.planner.runs = args.runs or cfg.planner.runs
cfg.model.use = args.model
cfg.planner.reachability = args.reachability or cfg.planner.reachability

# Post process the results and exit the main application
if args.post_process:
    from utils.post_process import process
    process(args.folder)
    exit()

# If we're not post processing, run experiments and move results to folder
from deep_rrt import DeepRRT

for exp_n in range(args.epochs):
    deepRRT = DeepRRT(exp_n + 1)
    deepRRT.run()

    # Move experiment to a separate folder
    loc = os.path.join(cfg.paths.tmp, args.folder)
    run_path = deepRRT.move_run_to(loc)

    # Free up memory
    del deepRRT
    gc.collect()

# We can also visualize the data
if args.visualize:
    from utils.visualizer import visualize
    visualize(run_path)
def train(self):
    # Initialize q_network and target_q_network
    self.target_q_network.set_weights(self.q_network.get_weights())

    # Populate memory first
    state = self.env.reset()
    print("Warming up...")
    for i in range(self.batch_size):
        action = self.env.action_space.sample()
        next_state, reward, done, info = self.env.step(action)
        # Priority=1 for these transitions
        self.store_memory(1, state, action, reward, next_state, done)
        state = next_state
        if done:
            state = self.env.reset()
    print("Warm up complete.")

    t = trange(self.max_episode)
    for episode_count in t:
        state = self.env.reset()
        current_step = 0
        episode_reward = 0
        while True:
            if self.render_environment:
                self.env.render()
            # Network predict
            q_values = self.q_network.predict(
                np.reshape(state, (1, self.num_states))).ravel()
            # Decide if exploring or not
            if np.random.rand() >= self.epsilon:
                action = np.argmax(q_values)
            else:
                action = random.randrange(self.num_actions)
            # Perform action
            next_state, reward, done, info = self.env.step(action)
            # Calculate priority
            next_q_values = self.target_q_network.predict(
                np.reshape(next_state, (1, self.num_states))).ravel()
            next_action = np.argmax(
                self.q_network.predict(
                    np.reshape(next_state, (1, self.num_states))).ravel())
            # Note that the td_error is not squared like in DQN
            td_error = reward + self.gamma * (
                1 - done) * next_q_values[next_action] - q_values[action]
            priority = (abs(td_error) + self.constant)**self.alpha
            # Store transition
            episode_reward += reward
            self.store_memory(priority, state, action, reward, next_state,
                              done)
            # Decrease exploration
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            if current_step % self.replay_frequency == 0:
                # Sample minibatch from memory based on their priority
                minibatch = []
                ISWeights = []
                min_prob = np.min(self.memory.tree[-self.memory.capacity:] /
                                  self.memory.total())
                T = self.memory.total() // self.batch_size
                for i in range(self.batch_size):
                    a, b = T * i, T * (i + 1)
                    s = random.uniform(a, b)
                    idx, priority, data = self.memory.get(s)
                    probability = priority / self.memory.total()
                    ISWeights.append(
                        np.power(probability / min_prob, -self.beta))
                    minibatch.append((*data, idx))
                self.beta = np.min(
                    [self.beta_max, self.beta + self.beta_increase])
                # Transform the minibatch for processing
                minibatch = list(zip(*minibatch))
                # Calculate all td_targets for current minibatch
                states, actions, rewards, next_states, dones, indices = minibatch
                batch_q_values = self.q_network.predict_on_batch(
                    np.array(states))
                batch_next_q_values = self.target_q_network.predict_on_batch(
                    np.array(next_states))
                next_actions = np.argmax(self.q_network.predict_on_batch(
                    np.array(next_states)), axis=1)
                td_targets = batch_q_values.copy()
                for i in range(self.batch_size):
                    td_targets[i][actions[i]] = rewards[i] + self.gamma * (
                        1 - dones[i]) * batch_next_q_values[i][next_actions[i]]
                    # Need to recalculate priorities for transitions in minibatch
                    priority = (abs(td_targets[i][actions[i]] -
                                    batch_q_values[i][actions[i]]) +
                                self.constant)**self.alpha
                    self.memory.update(indices[i], priority)
                # Train network
                self.q_network.train_on_batch(np.array(states),
                                              np.array(td_targets),
                                              np.array(ISWeights))
            # Hard copy q_network to target_q_network
            if done or current_step % self.update_frequency == 0:
                self.target_q_network.set_weights(
                    self.q_network.get_weights())
            # For logging and visualizing data
            if done or current_step > self.max_step:
                self.logger.log_history(episode_reward, episode_count)
                t.set_description("Episode: {}, Reward: {}".format(
                    episode_count, episode_reward))
                t.set_postfix(running_reward="{:.2f}".format(
                    self.logger.running_rewards[-1]))
                if episode_count % self.logger.slide_window == 0:
                    visualize(self.logger.rewards,
                              self.logger.running_rewards,
                              self.logger.episode_counts,
                              os.path.join(self.result_path,
                                           "DDQN_PER_Prop.png"))
                break
            state = next_state
            current_step += 1
import logging
import pickle

from src.Problem.Selectors.variable_selector import DegreeVariableSelector
from src.Problem.algorithm import AC3
from src.Problem.solver import CSPSolver
from utils.visualizer import visualize

REGIONS_NUM = 15
PLANE_SIZE = 50

logging.basicConfig(level=logging.DEBUG)

if __name__ == '__main__':
    problem, graph = generate_problem(REGIONS_NUM, PLANE_SIZE)

    with open('some_graph.pickle', 'wb') as f:
        pickle.dump(graph, f)
    # with open('some_graph.pickle', 'rb') as f:
    #     graph = pickle.load(f)

    logging.info("____________MAP_COLORING____________")
    solver = CSPSolver(problem=problem,
                       algorithm_type=AC3,
                       variable_selector=DegreeVariableSelector,
                       value_selector=LeastConstrainingValueSelector,
                       use_ac3=True)
    solutions = solver.get_solutions()
    result = solutions[0]
    ordered = [result[region].value for region in graph.neighbours]
    graph.to_json(
        PLANE_SIZE,
        f"graphs/{PLANE_SIZE}x{PLANE_SIZE}_{REGIONS_NUM}reg_csp.json",
        ordered)
    visualize(f"graphs/{PLANE_SIZE}x{PLANE_SIZE}_{REGIONS_NUM}reg_csp.json")