def main():
    env = gym.make('CartPole-v1')
    agent = QNetwork(env.observation_space, env.action_space)
    frame = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            env.render()
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            if done:
                # the environment does not return a negative reward on termination,
                # so penalise the terminal transition manually
                reward *= -1
            frame += 1
            if done or frame == MAX_NUM_FRAMES:
                break
            state = next_state
    env.close()
class Agent():
    def __init__(self, state_size, action_size, fc_units, seed, lr,
                 buffer_size, batch_size, update_every):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.update_every = update_every
        self.qnetwork_local = QNetwork(state_size, action_size, seed, fc_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, fc_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.5)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, gamma, tau):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if (self.t_step == 0) and (len(self.memory) > self.memory.batch_size):
            experiences = self.memory.sample()
            self.learn(experiences, gamma, tau)

    def act(self, state, eps):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(self.action_size))
        return action

    def learn(self, experiences, gamma, tau):
        states, actions, rewards, next_states, dones = experiences
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
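# A minimal sketch of how an Agent like the one above might be driven from a
# Gym-style training loop. The environment id, the module-level `device`, and
# all hyperparameter values are illustrative assumptions, not part of the
# original snippet.
import gym


def train(n_episodes=500, gamma=0.99, tau=1e-3,
          eps_start=1.0, eps_decay=0.995, eps_min=0.01):
    env = gym.make('LunarLander-v2')
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  fc_units=64, seed=0, lr=5e-4,
                  buffer_size=int(1e5), batch_size=64, update_every=4)
    eps = eps_start
    for episode in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, gamma, tau)
            state = next_state
            score += reward
        eps = max(eps_min, eps * eps_decay)  # decay exploration after each episode
        print('Episode {}: score {:.1f}, eps {:.3f}'.format(episode, score, eps))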
def build_networks(self):
    self.learning_network = QNetwork(
        name=self.hyperparameters["agent_name"] + "_learning_q_network",
        env=self.env,
        hyperparameters=self.hyperparameters,
        placeholders=self.placeholders,
        sess=self.sess)
    self.target_network = QNetwork(
        name=self.hyperparameters["agent_name"] + "_target_q_network",
        env=self.env,
        hyperparameters=self.hyperparameters,
        placeholders=self.placeholders,
        sess=self.sess)
def __init__(self, state_size, action_size, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
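# The constructor above relies on module-level constants (LR, BUFFER_SIZE,
# BATCH_SIZE, device). A plausible set of definitions is sketched below; the
# concrete values are assumptions, not taken from the original module.
import torch

BUFFER_SIZE = int(1e5)   # replay buffer capacity
BATCH_SIZE = 64          # minibatch size
LR = 5e-4                # Adam learning rate
UPDATE_EVERY = 4         # steps between learning updates
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")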
def __init__(self, env, hidden_units=None, network_LR=0.01, batch_size=1024,
             update_every=5, gamma=0.95):
    """
    Creates a DQN agent.

    :param env: game environment.
    :type env: Class Snake_Env().
    :param hidden_units: number of neurons in each layer.
    :type hidden_units: tuple with dimension (1, 3).
    :param network_LR: learning rate of the action-value neural network.
    :type network_LR: float.
    :param batch_size: size of the minibatch taken from the replay buffer.
    :type batch_size: int.
    :param update_every: number of iterations between updates of the target q-network.
    :type update_every: int.
    :param gamma: discount factor.
    :type gamma: float.
    """
    self.env = env
    self.BATCH_SIZE = batch_size
    self.GAMMA = gamma
    self.NETWORK_LR = network_LR
    self.MEMORY_CAPACITY = int(1e5)
    self.ACTION_SIZE = env.ACTION_SPACE
    self.HIDDEN_UNITS = hidden_units
    self.UPDATE_EVERY = update_every
    self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                   hidden_units=self.HIDDEN_UNITS,
                                   output_size=self.ACTION_SIZE,
                                   learning_rate=self.NETWORK_LR)
    self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                    hidden_units=self.HIDDEN_UNITS,
                                    output_size=self.ACTION_SIZE,
                                    learning_rate=self.NETWORK_LR)
    self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)
    # Temp variable
    self.t = 0
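# Illustrative instantiation of the constructor above. The enclosing class
# name (DQNAgent) and the three hidden-layer sizes are assumptions for this
# sketch, as is constructing Snake_Env() with no arguments.
env = Snake_Env()
agent = DQNAgent(env, hidden_units=(128, 64, 32), network_LR=0.01,
                 batch_size=1024, update_every=5, gamma=0.95)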
def main(env_name, optim, hiddens, eps_decay, memory_length, epochs, batch_size):
    env = gym.make(env_name)
    if hiddens:
        hidden_sizes = list(map(int, hiddens.split(',')))
    else:
        hidden_sizes = []
    q_network = QNetwork(inps=env.observation_space.shape[0],
                         outs=env.action_space.n,
                         hidden_sizes=hidden_sizes,
                         str_optim=optim)
    policy = EpsilonGreedyPolicy(start_eps=1.0,
                                 eps_decay=eps_decay,
                                 action_values=q_network.get_action_values,
                                 n_actions=env.action_space.n)
    memory = Memory(max_length=memory_length)
    Transition = namedtuple(
        'Transition', ["state", "action", "reward", "state_next", "done"])
    for epoch in range(epochs):
        state = env.reset()
        epoch_reward = 0
        for step in range(env.spec.max_episode_steps):
            action = policy.act(state)
            next_state, reward, done, _ = env.step(action)
            memory.store(Transition(state, action, reward, next_state, done))
            try:
                q_network.fit_transitions(memory.random_sample(batch_size))
            except MemTooSmallError:
                # memory not filled yet
                pass
            policy.update_epsilon()
            epoch_reward += reward
            state = next_state
            if done:
                break
        print('Episode {}/{} | Total reward: {} | Epsilon: {:.4f}'
              .format(epoch, epochs, epoch_reward, policy.epsilon))
    return
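# Example invocation of main() above; the argument values and the assumption
# that the function is called directly (rather than through a CLI parser) are
# illustrative only.
if __name__ == '__main__':
    main(env_name='CartPole-v1', optim='adam', hiddens='64,64',
         eps_decay=0.995, memory_length=10000, epochs=500, batch_size=32)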
def __init__(self, env, hidden_units=None, network_LR=0.001, batch_size=64,
             update_every=4, gamma=1.0):
    self.env = env
    self.BATCH_SIZE = batch_size
    self.GAMMA = gamma
    self.NETWORK_LR = network_LR
    self.MEMORY_CAPACITY = int(1e5)  # this is pythonic
    self.nA = env.ACTION_SPACE  # number of actions the agent can perform
    self.HIDDEN_UNITS = hidden_units
    self.UPDATE_EVERY = update_every
    # let's give it some brains
    self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                   hidden_units=self.HIDDEN_UNITS,
                                   output_size=self.nA,
                                   learning_rate=self.NETWORK_LR)
    print(self.qnetwork_local.model.summary())
    # I think of the target network as the PC,
    # where our agent stores all the concrete and important stuff
    self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                    hidden_units=self.HIDDEN_UNITS,
                                    output_size=self.nA,
                                    learning_rate=self.NETWORK_LR)
    # and the memory, of course
    self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)
    # handy temp variable
    self.t = 0
def main():
    env = gym.make('CartPole-v1')
    replay_memory = ReplayMemory()
    agent = QNetwork(env.observation_space, env.action_space)
    get_training_batch = make_training_transformer(env.observation_space.shape, agent)
    frame = 0
    acc_loss = 0
    acc_state_value = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            if take_random_action(frame):
                action = env.action_space.sample()  # pick a random action
            else:
                action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            if done:
                # the environment does not return a negative reward on termination,
                # so penalise the terminal transition manually
                reward *= -1
            experience = (state, action, reward, next_state, done)
            replay_memory.append(experience)
            frame += 1
            experience_samples = replay_memory.sample(BATCH_SIZE)
            state_batch, qs_batch = get_training_batch(experience_samples)
            acc_state_value += np.mean(qs_batch)
            loss = agent.training_step(state_batch, qs_batch)
            acc_loss += loss
            if frame % FRAMES_PER_SAVE == 0:
                model_filename = f"ckpt-loss={loss:.4f}"
                agent.save_model(model_filename)
            if frame % FRAMES_PER_PRINT == 0:
                print(f"Frame: {frame}")
                avg_loss = acc_loss / FRAMES_PER_PRINT
                avg_state_value = acc_state_value / FRAMES_PER_PRINT
                print(f"avg loss: {avg_loss:.4f}; avg value: {avg_state_value:.2f}")
                acc_loss = 0
                acc_state_value = 0
            if done or frame == MAX_NUM_FRAMES:
                break
            state = next_state
    env.close()
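# make_training_transformer() is defined elsewhere and is not shown here.
# Purely as an assumption about its interface, a transformer returning
# (state_batch, qs_batch) could compute one-step TD targets roughly as below;
# this is not the author's implementation, and agent.predict() is an assumed
# batch-prediction method.
import numpy as np


def make_training_batch(samples, agent, gamma=0.99):
    states, actions, rewards, next_states, dones = map(np.array, zip(*samples))
    qs = agent.predict(states)                        # assumed batch Q-value predictor
    next_qs = agent.predict(next_states).max(axis=1)  # greedy value of next state
    targets = rewards + gamma * next_qs * (1 - dones)
    qs[np.arange(len(actions)), actions] = targets    # overwrite taken actions with targets
    return states, qs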
def __init__(self, params):
    action_size = params['action_size']
    state_size = params['state_size']
    buf_params = params['buf_params']
    nn_params = params['nn_params']
    nn_params['l1'][0] = state_size
    nn_params['l5'][1] = action_size
    self.__learning_mode = params['learning_mode']
    if self.__learning_mode['DuelingDDQN']:
        self.__qnetwork_local = DuelingQNetwork(nn_params).to(device)
        self.__qnetwork_target = DuelingQNetwork(nn_params).to(device)
    else:
        self.__qnetwork_local = QNetwork(nn_params).to(device)
        self.__qnetwork_target = QNetwork(nn_params).to(device)
    self.__action_size = action_size
    self.__state_size = state_size
    self.__memory = ReplayBuffer(buf_params)
    self.__t = 0
    self.eps = params['eps_initial']
    self.gamma = params['gamma']
    self.learning_rate = params['learning_rate']
    self.update_period = params['update_period']
    self.a = params['a']
    self.b = params['b']
    self.e = params['e']
    self.tau = params['tau']
    self.__optimiser = optim.Adam(self.__qnetwork_local.parameters(), self.learning_rate)
    # other parameters
    self.agent_loss = 0.0
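# A params dictionary with the shape this constructor expects, reconstructed
# from the keys it reads. Every concrete value below (layer sizes, buffer
# settings, epsilon and prioritisation constants) is an illustrative
# assumption, as are the meanings noted for 'a', 'b' and 'e'.
params = {
    'state_size': 37,
    'action_size': 4,
    'learning_mode': {'DuelingDDQN': True},
    'nn_params': {'l1': [None, 128], 'l2': [128, 64], 'l3': [64, 64],
                  'l4': [64, 32], 'l5': [32, None]},  # l1[0]/l5[1] are overwritten above
    'buf_params': {'buffer_size': int(1e5), 'batch_size': 64},
    'eps_initial': 1.0,
    'gamma': 0.99,
    'learning_rate': 5e-4,
    'update_period': 4,
    'a': 0.6,    # prioritisation exponent (assumed)
    'b': 0.4,    # importance-sampling exponent (assumed)
    'e': 1e-3,   # priority offset (assumed)
    'tau': 1e-3,
}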
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: policy_weights_path = '/home/pulver/Desktop/episode_113250/policy/policy_checkpoint.ckp' summary_folder = "" # if empty the summary is written in ./log/ + current time screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend'] # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend) tot_actions = 6 batch_size = 32 # size of the experience batch tot_steps = 1000 # finite-horizont simulation # 10x10^6 high number since training can be stopped at every time steps_per_episodes = 40 # expressed in number step noop_time = 2.0 # pause in seconds between actions timer_total_start = time.time() rospy.init_node("DRLTestNode") rospy.loginfo("----- DRL Test Node -----") # Create a subscriber fot the greyscale image rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback) # TODO # restore default # rospy.Subscriber("/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30) # Store the last 30 messages # before discarding them images_stack_size = 4 r = rospy.Rate(10) # 10hz # Init session and networks sess = tf.Session() if (summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter(summary_folder, sess.graph) policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=(screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") init = tf.global_variables_initializer() sess.run(init) if (policy_weights_path != ""): print("Loading weights from memory: " + str(policy_weights_path)) policy_network.load_weights(policy_weights_path) else: print("The networks path are empty. Choose appropriate weights!") sys.exit() ########################################################################## ## START HERE ## ########################################################################## timer_start = time.time() actual_time = rospy.get_rostime() rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0 frame_episode = 0 cumulated_reward = 0 send_action("takeoff") # 1-Accumulate the first state send_action("stop") rospy.sleep(1.0) # acquire image from bottomcamera # get_image() state = _last_image # create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) for step in range(tot_steps): # 2- Get the action following epsilon-greedy or through policy network. # Here image_t is always ready and it can be given as input to # the network action_distribution = policy_network.return_action_distribution( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False) # print "" # print action_distribution action = np.argmax(action_distribution) action = convert_action_int_to_str(action) # print action send_action(action) rospy.sleep(noop_time) # 3- Get the state_t1 repeating the action obtained at step-2 and add everything # in the replay buffer. 
Then set state_t = state_t1 # get_image() image_t1 = _last_image send_action("stop") # If the action taken is to land and the UAV is inside the landing # BB, done will be calculated accordingly # NOTE: in the real implementation there's no way to get done and reward # done_reward = get_done_reward() # reward = done_reward.reward # done = done_reward.done image_t1 = np.expand_dims(image_t1, 2) # stack the images image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2) frame_episode += 1 image_t = image_t1 # cumulated_reward += reward # At every step check if more than 30 seconds from the start passed. # In that case, set done = True and end the episode timer_stop = time.time() # Stop the episode if the number of frame is more than a threshold if frame_episode >= steps_per_episodes: done = True # When the episode is done if done: timer_stop = time.time() actual_time = rospy.get_rostime() rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0 rospy_time_elapsed = rospy_stop_time - rospy_start_time print("Time episode: " + str(timer_stop - timer_start) + " seconds") print("Ros time episode: " + str(rospy_time_elapsed) + " seconds") # if cumulated_reward >= 0: # rospy.logwarn("Positive reward obtained!") # print("Cumulated reward: " + str(cumulated_reward)) print("Episode finished after {} timesteps".format(step + 1)) sys.stdout.flush() break
class Agent():
    def __init__(self, state_size, action_size, seed, training, pixels, lr=LR):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0
        self.pixels = pixels
        if pixels is False:
            from q_network import QNetwork
        else:
            from q_network_cnn import QNetwork
            print('loaded cnn network')
            self.loader = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])
        self.QN_local = QNetwork(state_size, action_size, seed, training).to(device)
        self.QN_target = QNetwork(state_size, action_size, seed, training).to(device)
        self.optimizer = optim.Adam(self.QN_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)  # TODO

    def act(self, state, eps):
        # if self.pixels is True:
        #     state = Variable(torch.from_numpy(state).float().to(device).view(state.shape[0], 3, 32, 32))
        if not self.pixels:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.QN_local.eval()
        with torch.no_grad():  # was `if torch.no_grad():`, which does not disable gradients
            action_values = self.QN_local(state)
        self.QN_local.train()
        if random.random() > eps:
            return int(np.argmax(action_values.cpu().data.numpy()))
        else:
            return int(random.choice(np.arange(self.action_size)))

    def step(self, state, action, reward, next_state, done, stack_size):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_RATE
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            samples = self.memory.sample()
            self.learn(samples, GAMMA, stack_size)

    def learn(self, experiences, gamma, stack_size):
        states, actions, rewards, next_states, dones = experiences
        if self.pixels:
            next_states = Variable(next_states)
            # next_states.view(next_states.shape[0], stack_size, 3, stack_size, 32, 32)
            states = Variable(states)  # states.view(states.shape[0], 3, 64, 64)
        # else: TODO bring back the old version stuff here
        _target = self.QN_target(next_states).detach().max(1)[0].unsqueeze(1)
        action_values_target = rewards + gamma * _target * (1 - dones)
        action_values_expected = self.QN_local(states).gather(1, actions)
        loss = F.mse_loss(action_values_expected, action_values_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # update the target Q-network
        for target_param, local_param in zip(self.QN_target.parameters(),
                                             self.QN_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
def main():
    parser = argparse.ArgumentParser(
        'a program to train or run a deep q-learning agent')
    parser.add_argument("game", type=str, help="name of game to play")
    parser.add_argument("agent_type", type=str,
                        help="name of learning/acting technique used")
    parser.add_argument("agent_name", type=str,
                        help="unique name of this agent instance")
    parser.add_argument("--rom_path", type=str,
                        help="path to directory containing atari game roms",
                        default='../roms')
    parser.add_argument("--watch",
                        help="if true, a pretrained model with the specified name is loaded and tested with the game screen displayed",
                        action='store_true')
    parser.add_argument("--epochs", type=int, help="number of epochs", default=200)
    parser.add_argument("--epoch_length", type=int,
                        help="number of steps in an epoch", default=250000)
    parser.add_argument("--test_steps", type=int,
                        help="max number of steps per test", default=125000)
    parser.add_argument("--test_steps_hardcap", type=int,
                        help="absolute max number of steps per test", default=135000)
    parser.add_argument("--test_episodes", type=int,
                        help="max number of episodes per test", default=30)
    parser.add_argument("--history_length", type=int,
                        help="number of frames in a state", default=4)
    parser.add_argument("--training_frequency", type=int,
                        help="number of steps run before training", default=4)
    parser.add_argument("--random_exploration_length", type=int,
                        help="number of randomly-generated experiences to initially fill experience memory",
                        default=50000)
    parser.add_argument("--initial_exploration_rate", type=float,
                        help="initial exploration rate", default=1.0)
    parser.add_argument("--final_exploration_rate", type=float,
                        help="final exploration rate from linear annealing", default=0.1)
    parser.add_argument("--final_exploration_frame", type=int,
                        help="frame at which the final exploration rate is reached",
                        default=1000000)
    parser.add_argument("--test_exploration_rate", type=float,
                        help="exploration rate while testing", default=0.05)
    parser.add_argument("--frame_skip", type=int,
                        help="number of frames to repeat chosen action", default=4)
    parser.add_argument("--screen_dims", type=tuple,
                        help="dimensions to resize frames", default=(84, 84))
    # used for stochasticity and to help prevent overfitting.
    # Must be greater than frame_skip * (observation_length - 1) + buffer_length - 1
    parser.add_argument("--max_start_wait", type=int,
                        help="max number of frames to wait for initial state", default=60)
    # buffer_length = 1 prevents blending
    parser.add_argument("--buffer_length", type=int,
                        help="length of buffer to blend frames", default=2)
    parser.add_argument("--blend_method", type=str,
                        help="method used to blend frames",
                        choices=('max',), default='max')
    parser.add_argument("--reward_processing", type=str,
                        help="method to process rewards",
                        choices=('clip', 'none'), default='clip')
    # must set network_architecture to custom in order to use a custom architecture
    parser.add_argument("--conv_kernel_shapes", type=tuple,
                        help="shapes of convnet kernels: ((height, width, in_channels, out_channels), (next layer))")
    # must have same length as conv_kernel_shapes
    parser.add_argument("--conv_strides", type=tuple,
                        help="convnet strides: ((1, height, width, 1), (next layer))")
    # currently, you must have at least one dense layer
    parser.add_argument("--dense_layer_shapes", type=tuple,
                        help="shapes of dense layers: ((in_size, out_size), (next layer))")
    parser.add_argument("--discount_factor", type=float,
                        help="constant to discount future rewards", default=0.99)
    parser.add_argument("--learning_rate", type=float,
                        help="constant to scale parameter updates", default=0.00025)
    parser.add_argument("--optimizer", type=str,
                        help="optimization method for network",
                        choices=('rmsprop', 'graves_rmsprop'), default='rmsprop')
    parser.add_argument("--rmsprop_decay", type=float,
                        help="decay constant for moving average in rmsprop", default=0.95)
    parser.add_argument("--rmsprop_epsilon", type=float,
                        help="constant to stabilize rmsprop", default=0.01)
    # set error_clipping to less than 0 to disable
    parser.add_argument("--error_clipping", type=float,
                        help="constant at which td-error becomes linear instead of quadratic",
                        default=1.0)
    # set gradient clipping to 0 or less to disable. Currently only works with graves_rmsprop.
    parser.add_argument("--gradient_clip", type=float,
                        help="clip gradients to have the provided L2-norm", default=0)
    parser.add_argument("--target_update_frequency", type=int,
                        help="number of steps between target network updates", default=10000)
    parser.add_argument("--memory_capacity", type=int,
                        help="max number of experiences to store in experience memory",
                        default=1000000)
    parser.add_argument("--batch_size", type=int,
                        help="number of transitions sampled from memory during learning",
                        default=32)
    # must set to custom in order to specify a custom architecture
    parser.add_argument("--network_architecture", type=str,
                        help="name of prespecified network architecture",
                        choices=("deepmind_nips", "deepmind_nature", "custom"),
                        default="deepmind_nature")
    parser.add_argument("--recording_frequency", type=int,
                        help="number of steps before tensorboard recording", default=50000)
    parser.add_argument("--saving_threshold", type=int,
                        help="min score threshold for saving model", default=0)
    parser.add_argument("--parallel", help="parallelize acting and learning",
                        action='store_true')
    parser.add_argument("--double_dqn",
                        help="use double q-learning algorithm in error target calculation",
                        action='store_true')
    args = parser.parse_args()

    if args.network_architecture == 'deepmind_nature':
        args.conv_kernel_shapes = [[8, 8, 4, 32], [4, 4, 32, 64], [3, 3, 64, 64]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1], [1, 1, 1, 1]]
        args.dense_layer_shapes = [[3136, 512]]
    elif args.network_architecture == 'deepmind_nips':
        args.conv_kernel_shapes = [[8, 8, 4, 16], [4, 4, 16, 32]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1]]
        args.dense_layer_shapes = [[2592, 256]]

    if not args.watch:
        train_stats = RecordStats(args, False)
        test_stats = RecordStats(args, True)
        training_emulator = AtariEmulator(args)
        testing_emulator = AtariEmulator(args)
        num_actions = len(training_emulator.get_possible_actions())
        experience_memory = ExperienceMemory(args, num_actions)
        q_network = None
        agent = None
        if args.parallel:
            q_network = ParallelQNetwork(args, num_actions)
            agent = ParallelDQNAgent(args, q_network, training_emulator,
                                     experience_memory, num_actions, train_stats)
        else:
            q_network = QNetwork(args, num_actions)
            agent = DQNAgent(args, q_network, training_emulator,
                             experience_memory, num_actions, train_stats)
        experiment.run_experiment(args, agent, testing_emulator, test_stats)
    else:
        testing_emulator = AtariEmulator(args)
        num_actions = len(testing_emulator.get_possible_actions())
        q_network = QNetwork(args, num_actions)
        agent = DQNAgent(args, q_network, None, None, num_actions, None)
        experiment.evaluate_agent(args, agent, testing_emulator, None)
class Agent:
    def __init__(self, env, state_space, action_space, device, learning_rate,
                 buffer_size, batch_size, gamma, in_channels, train_freq=4,
                 target_update_freq=1e4, is_ddqn=False):
        self.env = env
        self.state_space = state_space
        self.action_space = action_space
        self.QNetwork_local = QNetwork(in_channels, self.state_space,
                                       self.action_space.n, device=device).to(device)
        self.QNetwork_local.init_weights()
        self.QNetwork_target = QNetwork(in_channels, self.state_space,
                                        self.action_space.n, device=device).to(device)
        self.QNetwork_target.load_state_dict(self.QNetwork_local.state_dict())
        self.optimizer = torch.optim.RMSprop(self.QNetwork_local.parameters(),
                                             lr=learning_rate, alpha=0.95,
                                             eps=0.01, centered=True)
        self.criterion = torch.nn.MSELoss()
        self.memory = ReplayBuffer(capacity=int(buffer_size), batch_size=batch_size)
        self.step_count = 0
        self.batch_size = batch_size
        self.gamma = gamma
        self.device = device
        self.buffer_size = buffer_size
        self.num_train_updates = 0
        self.train_freq = train_freq
        self.target_update_freq = target_update_freq
        self.is_ddqn = is_ddqn
        print('Agent initialized with memory size: {}'.format(buffer_size))

    def act(self, state, epsilon):
        if random.random() > epsilon:
            # switch to evaluation mode, evaluate the state, switch back to train mode
            self.QNetwork_local.eval()
            with torch.no_grad():  # was `if torch.no_grad():`, which does not disable gradients
                actions = self.QNetwork_local(state)
            self.QNetwork_local.train()
            actions_np = actions.data.cpu().numpy()[0]
            best_action_idx = int(np.argmax(actions_np))
            return best_action_idx
        else:
            rand_action = self.action_space.sample()
            return rand_action

    def step(self, state, action, reward, next_state, done, add_memory=True):
        # TODO calculate priority here?
        if add_memory:
            priority = 1.
            reward_clip = np.sign(reward)
            self.memory.add(state=state, action=action, next_state=next_state,
                            reward=reward_clip, done=done, priority=priority)
        self.step_count = (self.step_count + 1) % self.train_freq  # self.update_rate
        # if self.step_count == 0 and len(self.memory) >= self.batch_size:
        self.network_is_updated = False
        if self.step_count == 0 and len(self.memory) == self.buffer_size:
            samples = self.memory.random_sample(self.device)
            self.learn(samples)
            self.num_train_updates += 1
            self.network_is_updated = True

    def learn(self, samples):
        states, actions, rewards, next_states, dones = samples
        if self.is_ddqn is True:
            # DDQN: pick the max action with the local network and gather its
            # value from the target network
            next_actions = torch.argmax(self.QNetwork_local(next_states).detach(),
                                        dim=1).unsqueeze(1)
            q_target_next = self.QNetwork_target(next_states).gather(1, next_actions)
        else:
            # DQN: take the max action value from the target network
            q_target_next = self.QNetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # expected action values
        q_local_current = self.QNetwork_local(states).gather(1, actions)
        self.optimizer.zero_grad()  # clean up previous gradients
        # TD error
        TD_target = rewards + (self.gamma * q_target_next * (1 - dones))
        TD_error = self.criterion(q_local_current, TD_target)
        TD_error.backward()
        torch.nn.utils.clip_grad_norm_(self.QNetwork_local.parameters(), 5.)
        self.optimizer.step()
        if (self.num_train_updates / self.train_freq) % self.target_update_freq == 0:
            self.QNetwork_target.load_state_dict(self.QNetwork_local.state_dict())
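# For reference, the two bootstrap targets implemented in learn() above:
#   DQN:  y = r + gamma * max_a' Q_target(s', a')
#   DDQN: y = r + gamma * Q_target(s', argmax_a' Q_local(s', a'))
# Decoupling action selection (local network) from action evaluation (target
# network) is what reduces the overestimation bias of plain DQN.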
        # xs/ys are built per sequence x earlier in this (truncated) function
        index = x.index(1)
        y = [0 for _ in range(index)] + [1 for _ in range(len(x) - index)]
        ys.append(y)
    if num_batches > batch_size:
        remainder = num_batches % batch_size
        while remainder != 0:
            xs.append([0 for _ in range(seq_length)])
            ys.append([0 for _ in range(seq_length)])
            remainder -= 1
    xs = torch.Tensor(xs).unsqueeze(2).to(device)
    ys = torch.Tensor(ys).unsqueeze(2).to(device)
    print('x: {}'.format(xs[:, :, 0]))
    print('y: {}'.format(ys[:, :, 0]))
    return xs, ys


q = QNetwork(input_size, hidden_size, seq_length).to(device)
p = PNetwork(input_size, hidden_size, seq_length).to(device)
params = list(q.parameters()) + list(p.parameters())
optimizer = torch.optim.Adagrad(params, lr=lr)


def train(xs, ys):
    q.train()
    p.train()
    train_loss = 0
    for epoch in range(num_epochs):
        for b in range(0, num_batches, batch_size):
            z = q.sample(xs[b:b + batch_size], ys[b:b + batch_size], 0)
            z = z.detach()
            # print(z.size())
def deep_qlearning(env, nframes, discount_factor, N, C, mini_batch_size, replay_start_size, sgd_update_frequency, initial_exploration, final_exploration, final_exploration_frame, lr, alpha, m): """ Input: - env: environment - nframes: # of frames to train on - discount_factor (gamma): how much to discount future rewards - N: replay memory size - C: number of steps before updating Q target network - mini_batch_size: mini batch size - replay_start_size: minimum size of replay memory before learning starts - sgd_update_frequency: number of action selections in between consecutive mini batch SGD updates - initial_exploration: initial epsilon value - final_exploration: final epsilon value - final_exploration_frame: number of frames over which the epsilon is annealed to its final value - lr: learning rate used by RMSprop - alpha: alpha value used by RMSprop - m: number of consecutive frames to stack for input to Q network Output: - Q: trained Q-network """ n_actions = env.action_space.n Q = QNetwork(n_actions) Q_target = deepcopy(Q) Q_target.eval() transform = T.Compose([T.ToTensor()]) optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha) criterion = nn.MSELoss() D = deque(maxlen=N) # replay memory last_Q_target_update = 0 frames_count = 0 last_sgd_update = 0 episodes_count = 0 episode_rewards = [] while True: frame_sequence = initialize_frame_sequence(env, m) state = transform(np.stack(frame_sequence, axis=2)) episode_reward = 0 done = False while not done: epsilon = annealed_epsilon(initial_exploration, final_exploration, final_exploration_frame, frames_count) action = get_epsilon_greedy_action(Q, state.unsqueeze(0), epsilon, n_actions) frame, reward, done, _ = env.step(action.item()) reward = torch.tensor([reward]) episode_reward += reward.item() if done: next_state = None episode_rewards.append(episode_reward) else: frame_sequence.append(preprocess_frame(frame)) next_state = transform(np.stack(frame_sequence, axis=2)) D.append((state, action, reward, next_state)) state = next_state if len(D) < replay_start_size: continue last_sgd_update += 1 if last_sgd_update < sgd_update_frequency: continue last_sgd_update = 0 sgd_update(Q, Q_target, D, mini_batch_size, discount_factor, optimizer, criterion) last_Q_target_update += 1 frames_count += mini_batch_size if last_Q_target_update % C == 0: Q_target = deepcopy(Q) Q_target.eval() if frames_count >= nframes: return Q, episode_rewards episodes_count += 1 if episodes_count % 100 == 0: save_stuff(Q, episode_rewards) print(f'episodes completed = {episodes_count},', f'frames processed = {frames_count}')
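# A sketch of how deep_qlearning() above might be invoked with hyperparameters
# in the spirit of the original DQN paper; the environment id and every value
# below are illustrative assumptions.
import gym

env = gym.make('PongNoFrameskip-v4')
Q, episode_rewards = deep_qlearning(
    env,
    nframes=10_000_000,
    discount_factor=0.99,
    N=1_000_000,                  # replay memory size
    C=10_000,                     # target network update period
    mini_batch_size=32,
    replay_start_size=50_000,
    sgd_update_frequency=4,
    initial_exploration=1.0,
    final_exploration=0.1,
    final_exploration_frame=1_000_000,
    lr=2.5e-4,
    alpha=0.95,                   # RMSprop smoothing constant
    m=4,                          # stacked frames per state
)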
import keras as ks
import numpy as np

from agents.deep_q import DeepQLearning
from environments.snake import SnakeVisual
from q_network import QNetwork

env = SnakeVisual(grid_size=[8, 8], render=True, render_freq=1)
actions = env.valid_actions()
size = np.shape(env.reset().state)

nn = ks.models.Sequential()
nn.add(ks.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='sigmoid', input_shape=size))
nn.add(ks.layers.Conv2D(filters=24, kernel_size=(3, 3), activation='sigmoid'))
nn.add(ks.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='sigmoid'))
nn.add(ks.layers.Flatten())
nn.add(ks.layers.Dense(units=16, activation='sigmoid'))
nn.add(ks.layers.Dense(units=3, activation='linear'))
nn.compile(optimizer=ks.optimizers.Adam(lr=0.0001), loss='mse')
print(nn.summary())


def normalize_state(s):
    return np.reshape(s.state, newshape=(1,) + size)


dqn = QNetwork(nn, actions, normalize_state)
dql = DeepQLearning(env, dqn)
q = dql.learn()
class Agent: def __init__(self, device, state_size, action_size, buffer_size=10, batch_size=10, learning_rate=0.1, discount_rate=0.99, eps_decay=0.9, tau=0.1, steps_per_update=4): self.device = device self.state_size = state_size self.action_size = action_size self.q_network_control = QNetwork(state_size, action_size).to(device) self.q_network_target = QNetwork(state_size, action_size).to(device) self.optimizer = torch.optim.Adam(self.q_network_control.parameters(), lr=learning_rate) self.batch_size = batch_size self.replay_buffer = ReplayBuffer(device, state_size, action_size, buffer_size) self.discount_rate = discount_rate self.eps = 1.0 self.eps_decay = eps_decay self.tau = tau self.step_count = 0 self.steps_per_update = steps_per_update def policy(self, state): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) return self.epsilon_greedy_policy(self.eps, state) def epsilon_greedy_policy(self, eps, state): self.q_network_control.eval() with torch.no_grad(): action_values = self.q_network_control(state) self.q_network_control.train() if random.random() > eps: greedy_choice = np.argmax(action_values.cpu().data.numpy()) return greedy_choice else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): p = self.calculate_p(state, action, reward, next_state, done) self.replay_buffer.add(state, action, reward, next_state, done, p) if self.step_count % self.steps_per_update == 0: self.learn() self.step_count += 1 def learn(self): if len(self.replay_buffer) < self.batch_size: return states, actions, rewards, next_states, dones, p = \ self.replay_buffer.sample(self.batch_size) error = self.bellman_eqn_error(states, actions, rewards, next_states, dones) importance_scaling = (self.replay_buffer.buffer_size * p)**-1 loss = (importance_scaling * (error**2)).sum() / self.batch_size self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.update_target() def bellman_eqn_error(self, states, actions, rewards, next_states, dones): """Double DQN error - use the control network to get the best action and apply the target network to it to get the target reward which is used for the bellman eqn error. 
""" self.q_network_control.eval() with torch.no_grad(): a_max = self.q_network_control(next_states).argmax(1).unsqueeze(1) target_action_values = self.q_network_target(next_states).gather( 1, a_max) target_rewards = rewards + self.discount_rate * (1 - dones) \ * target_action_values self.q_network_control.train() current_rewards = self.q_network_control(states).gather(1, actions) error = current_rewards - target_rewards return error def calculate_p(self, state, action, reward, next_state, done): next_state = torch.from_numpy(next_state[np.newaxis, :]).float().to( self.device) state = torch.from_numpy(state[np.newaxis, :]).float().to(self.device) action = torch.from_numpy(np.array([[action]])).long().to(self.device) reward = torch.from_numpy(np.array([reward])).float().to(self.device) done = torch.from_numpy(np.array([[done]], dtype=np.uint8)).float().to( self.device) return abs( self.bellman_eqn_error(state, action, reward, next_state, done)) + 1e-3 def update_target(self): for target_param, control_param in zip( self.q_network_target.parameters(), self.q_network_control.parameters()): target_param.data.copy_(self.tau * control_param.data + (1.0 - self.tau) * target_param.data) def end_of_episode(self): self.eps *= self.eps_decay self.step_count = 0 def save(self, path): torch.save(self.q_network_control.state_dict(), path) def restore(self, path): self.q_network_control.load_state_dict(torch.load(path))
if __name__ == '__main__':
    import keras as ks
    import numpy as np

    from agents.deep_q import DeepQLearning
    from environments.cartpole import CartPole
    from q_network import QNetwork

    nn = ks.models.Sequential()
    nn.add(ks.layers.Dense(32, activation='sigmoid', input_shape=(4, )))
    nn.add(ks.layers.Dense(32, activation='sigmoid'))
    nn.add(ks.layers.Dense(2, activation='linear'))
    nn.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

    env = CartPole(render=True)
    actions = env.valid_actions()
    dqn = QNetwork(nn, actions, lambda x: np.reshape(x.state, newshape=(1, 4)))
    dql = DeepQLearning(env, dqn)
    q = dql.learn()
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: policy_weights_path = '/home/pulver/Desktop/DQN-single/episode_36000/policy/policy_checkpoint.ckp' ground_list = [ "asphalt11_small", "asphalt12_small", "asphalt13_small", "brick11_small", "brick12_small", "brick13_small", "grass11_small", "grass12_small", "grass13_small", "pavement11_small", "pavement12_small", "pavement13_small", "sand11_small", "sand12_small", "sand13_small", "snow11_small", "snow12_small", "snow13_small", "soil11_small", "soil12_small", "soil13_small" ] summary_folder = "" # if empty the summary is written in ./log/ + current time start_episode = 0 # change to last episode number frame_counter = 0 # change to last number of frames screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (land), # 6 (ascend), 7 (descend), 8 (rotate_left), 9 (rotate_right) tot_actions = 6 batch_size = 32 # size of the experience batch tot_steps = 1000 # finite-horizont simulation # 10x10^6 high number since training can be stopped at every time tot_episodes = 50 steps_per_episodes = 20 # expressed in number step noop_time = 2.0 # pause in seconds between actions num_ground_plane = 21 actual_ground_index = 0 # episode_per_ground specify the number of episodes with the same ground # plane ground_counter = 1 episode_per_ground = 10 timer_total_start = time.time() rospy.init_node("DeepReinforcedLanding") rospy.loginfo("----- Deep Reinforced Landing Node -----") rospy.Subscriber( "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30) # Store the last 30 messages before discarding them r = rospy.Rate(10) # 10hz # Init session and networks sess = tf.Session() if (summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter(summary_folder, sess.graph) policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=(screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") init = tf.global_variables_initializer() sess.run(init) # Load Neural Networks weights from memory if a valid checkpoint path is passed # if(os.path.isfile(policy_weights_path) == True and # os.path.isfile(target_weights_path) == True): try: print("Loading weights from memory...") policy_network.load_weights(policy_weights_path) except: print("Error while loading the weights!") return # start here for episode in range(start_episode, tot_episodes): # Reset the ground in the environment every 50 episodes (or # episode_per_ground) ground_index = episode / episode_per_ground if ground_index != actual_ground_index or (episode == start_episode): clean_world(ground_list) generate_new_world(ground_list) actual_ground_index = ground_index timer_start = time.time() actual_time = rospy.get_rostime() rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0 cumulated_reward = 0 print("") print("Episode: " + str(episode)) # 1-Accumulate the first state reset_pose() print "Reset pose!" 
send_action("stop") rospy.sleep(1.0) state = _last_image # create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) frame_episode = 0 for step in range(tot_steps): # 2- Get the action following epsilon-greedy or through policy network. # With probability epsilon take random action otherwise it takes # an action from the policy network. # Take a random action # Here image_t is always ready and it can be given as input to # the network action_distribution = policy_network.return_action_distribution( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False) action = np.argmax(action_distribution) action = convert_action_int_to_str(action) # action = get_random_action() send_action(action) rospy.sleep(noop_time) # 3- Get the state_t1 repeating the action obtained at step-2 and add everything # in the replay buffer. Then set state_t = state_t1 # get_image() image_t1 = _last_image # If the action taken is to land and the UAV is inside the landing # BB, done will be calculated accordingly done_reward = get_done_reward() reward = done_reward.reward done = done_reward.done image_t1 = np.expand_dims(image_t1, 2) # stack the images image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2) image_t = image_t1 cumulated_reward += reward send_action("stop") frame_episode = frame_episode + 1 # get_relative_pose(ground_index, episode, step, reward) # At every step check if more than 30 seconds from the start passed. # In that case, set done = True and end the episode timer_stop = time.time() # Stop the episode if the number of frame is more than a threshold if frame_episode >= steps_per_episodes: done = True # if timer_stop - timer_start >= 30.0: # maximum time allowed # #cumulated_reward += -1 # done = True # When the episode is done if done: timer_stop = time.time() actual_time = rospy.get_rostime() rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0 rospy_time_elapsed = rospy_stop_time - rospy_start_time print("Tot Frame counter: " + str(frame_counter)) print("Time episode: " + str(timer_stop - timer_start) + " seconds") print("Ros time episode: " + str(rospy_time_elapsed) + " seconds") if cumulated_reward >= 0: rospy.logwarn("Positive reward obtained!") print("Cumulated reward: " + str(cumulated_reward)) print("Episode finished after {} timesteps".format(step + 1)) sys.stdout.flush() # time.sleep(5) with open( "./results_" + str(datetime.date.today().strftime("%m%d%Y")) + ".csv", "a") as myfile: boolean_reward = 0 if (cumulated_reward > 0): boolean_reward = 1 ground_list = "mixed" string_to_add = "mixed" + "," + str(episode) + "," + str( step) + "," + str(cumulated_reward) + "," + str( boolean_reward) + '\n' myfile.write(string_to_add) break break
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ground_list = ["water1", # "water2", # "water3", # "water4", # "water5", # "water6", # "water7", # "water8", # "water9", # "water10"] # ground_list = ["water11", "water12", "water13", # "water14", "water15", "water16", # "water17", "water18", "water19"] ground_list = [ "pavement11", "pavement12", "pavement5", "pavement7", "pavement14", "pavement15", "pavement16" ] # NOTE: the ckp path must be modified in the code below ckp_list = ["120750"] # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: # policy_weights_path = # '/home/pulver/Desktop/marine_descending/checkpoint/episode_94500/policy/policy_checkpoint.ckp' summary_folder = "" # if empty the summary is written in ./log/ + current time start_episode = 0 # change to last episode number frame_counter = 0 # change to last number of frames screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend'] # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend) tot_actions = 6 batch_size = 32 # size of the experience batch tot_steps = 1000 # finite-horizont simulation # 10x10^6 high number since training can be stopped at every time steps_per_episodes = 40 # expressed in number step noop_time = 2.0 # pause in seconds between actions num_ground_plane = len(ground_list) actual_ground_index = 0 # episode_per_ground specify the number of episodes with the same ground # plane episode_per_ground = 10 tot_episodes = episode_per_ground * num_ground_plane accuracy_ground = 0.0 timer_total_start = time.time() rospy.init_node("DRLTestNode") rospy.loginfo("----- DRL Test Node -----") # Create a subscriber fot the greyscale image # rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)#TODO # restore default rospy.Subscriber( "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30) # Store the last 30 messages before discarding them # Create a publisher for changing light position # light_pub = rospy.Publisher('/gazebo/default/light/modify', Light) images_stack_size = 4 # r = rospy.Rate(10) # 10hz # Init session and networks sess = tf.Session() policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=(screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") init = tf.global_variables_initializer() sess.run(init) for i in range(len(ckp_list)): ckp = ckp_list[i] policy_weights_path = '/home/pulver/Desktop/sim11/episode_' + \ str(ckp) + '/policy/policy_checkpoint.ckp' if (summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './' + str(ckp) + '/log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter(summary_folder, sess.graph) if (policy_weights_path != ""): print("Loading weights from memory: " + str(policy_weights_path)) policy_network.load_weights(policy_weights_path) else: print("The networks path are empty. 
Choose appropriate weights!") sys.exit() # start here for episode in range(start_episode, tot_episodes): # Reset the ground in the environment every episode_per_ground ground_index = episode / episode_per_ground print "Current ground index: " + str(ground_index) if episode % ( episode_per_ground) == 0 and episode != start_episode: with open( "./" + str(ckp) + "/accuracy_ckp" + str(ckp) + ".csv", "a") as myfile: accuracy_ground = accuracy_ground / \ (episode_per_ground * 1.0) print "Final accuracy value: " + str(accuracy_ground) string_to_add = ground_list[ground_index - 1] + \ "," + str(accuracy_ground) + "\n" myfile.write(string_to_add) if ground_index != actual_ground_index or (episode == start_episode): accuracy_ground = 0.0 clean_world(ground_list) generate_new_world(ground_index, ground_list) # Reset the light every 10 episodes # if (episode % 10 == 0): # # Change the position of the three point lights # for i in range(0, 3): # model_to_change = "user_point_light_" + str(i) # x = STDrandom.uniform(-5.0, 5.0) # y = STDrandom.uniform(-5.0, 5.0) # z = STDrandom.uniform(1.0, 10.0) # # os.system("rosrun gazebo_ros spawn_model -file ~/.gazebo/models/" + model_to_change + # # "/model.sdf -sdf -model " + model_to_change + "_plane -x " + x + " -y " + y + " -z " + z + "") # light_msg = Light() # light_msg.pose.x = x # light_msg.pose.y = y # light_msg.pose.z = z # light_msg.name = model_to_change # light_pub.publish(light_msg) # rospy.loginfo("Lights have been resetted!") actual_ground_index = ground_index timer_start = time.time() actual_time = rospy.get_rostime() rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0 frame_episode = 0 cumulated_reward = 0 epsilon_used = 0 send_action("takeoff") print("") print("Episode: " + str(episode)) # 0-Reset USV's position os.system( "rosservice call /gazebo/set_model_state '{model_state: { model_name: usv_model, pose: { position: { x: 0.0, y: 0.0 ,z: 5.0 }, orientation: {x: 0, y: 0.0, z: 0, w: 0.0 } }, twist: { linear: {x: 0.0 , y: 0 ,z: 0 } , angular: { x: 0.0 , y: 0 , z: 0.0 } } , reference_frame: world } }'" ) # 1-Accumulate the first state reset_pose() print "Reset pose!" send_action("stop") rospy.sleep(1.0) # get_image() state = _last_image # create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) for step in range(tot_steps): # 2- Get the action following epsilon-greedy or through policy network. # Here image_t is always ready and it can be given as input to # the network action_distribution = policy_network.return_action_distribution( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False) # print "" # print action_distribution action = np.argmax(action_distribution) action = convert_action_int_to_str(action) # print action send_action(action) rospy.sleep(noop_time) # 3- Get the state_t1 repeating the action obtained at step-2 and add everything # in the replay buffer. 
Then set state_t = state_t1 # get_image() image_t1 = _last_image send_action("stop") # If the action taken is to land and the UAV is inside the landing # BB, done will be calculated accordingly done_reward = get_done_reward() reward = done_reward.reward done = done_reward.done image_t1 = np.expand_dims(image_t1, 2) # stack the images image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2) frame_counter += 1 # To call every time a frame is obtained frame_episode += 1 image_t = image_t1 get_relative_pose(ground_index, episode, step, reward, ground_list, ckp) cumulated_reward += reward # At every step check if more than 30 seconds from the start passed. # In that case, set done = True and end the episode timer_stop = time.time() # Stop the episode if the number of frame is more than a # threshold if frame_episode >= steps_per_episodes: done = True # When the episode is done if done: timer_stop = time.time() actual_time = rospy.get_rostime() rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0 rospy_time_elapsed = rospy_stop_time - rospy_start_time print("Tot Frame counter: " + str(frame_counter)) print("Time episode: " + str(timer_stop - timer_start) + " seconds") print("Ros time episode: " + str(rospy_time_elapsed) + " seconds") if cumulated_reward >= 0: rospy.logwarn("Positive reward obtained!") print("Cumulated reward: " + str(cumulated_reward)) print("Episode finished after {} timesteps".format(step + 1)) sys.stdout.flush() with open( "./" + str(ckp) + "/results_ckp" + str(ckp) + "_" + str(datetime.date.today().strftime("%m%d%Y")) + ".csv", "a") as myfile: boolean_reward = 0 if (cumulated_reward > 0): boolean_reward = 1 accuracy_ground += 1.0 print "Current accuracy value: " + str(accuracy_ground) string_to_add = ground_list[ground_index] + "," + str( episode) + "," + str(step) + "," + str( cumulated_reward) + "," + str( boolean_reward) + '\n' myfile.write(string_to_add) if episode == tot_episodes - 1: with open( "./" + str(ckp) + "/accuracy_ckp" + str(ckp) + ".csv", "a") as myfile: accuracy_ground = accuracy_ground / \ (episode_per_ground * 1.0) print "Final accuracy value: " + str( accuracy_ground) string_to_add = ground_list[actual_ground_index] + \ "," + str(accuracy_ground) + "\n" myfile.write(string_to_add) break
class DQN:
    def __init__(self, config, game, directory, callback=None, summary_writer=None):
        self.game = game
        self.actions = game.get_available_actions()
        self.feedback_size = game.get_feedback_size()
        self.callback = callback
        self.summary_writer = summary_writer
        self.config = config
        self.batch_size = config['batch_size']
        self.n_episode = config['num_episode']
        self.capacity = config['capacity']
        self.epsilon_decay = config['epsilon_decay']
        self.epsilon_min = config['epsilon_min']
        self.num_frames = config['num_frames']
        self.num_nullops = config['num_nullops']
        self.time_between_two_copies = config['time_between_two_copies']
        self.input_scale = config['input_scale']
        self.update_interval = config['update_interval']
        self.directory = directory
        self._init_modules()

    def _init_modules(self):
        # Replay memory
        self.replay_memory = ReplayMemory(history_len=self.num_frames,
                                          capacity=self.capacity,
                                          batch_size=self.batch_size,
                                          input_scale=self.input_scale)
        input_shape = self.feedback_size + (self.num_frames, )
        # Q-network
        self.q_network = QNetwork(input_shape=input_shape,
                                  n_outputs=len(self.actions),
                                  network_type=self.config['network_type'],
                                  scope='q_network')
        # Target network
        self.target_network = QNetwork(input_shape=input_shape,
                                       n_outputs=len(self.actions),
                                       network_type=self.config['network_type'],
                                       scope='target_network')
        # Optimizer
        self.optimizer = Optimizer(config=self.config,
                                   feedback_size=self.feedback_size,
                                   q_network=self.q_network,
                                   target_network=self.target_network,
                                   replay_memory=self.replay_memory)
        # Ops for updating target network
        self.clone_op = self.target_network.get_clone_op(self.q_network)
        # For tensorboard
        self.t_score = tf.placeholder(dtype=tf.float32, shape=[], name='new_score')
        tf.summary.scalar("score", self.t_score, collections=['dqn'])
        self.summary_op = tf.summary.merge_all('dqn')

    def set_summary_writer(self, summary_writer=None):
        self.summary_writer = summary_writer
        self.optimizer.set_summary_writer(summary_writer)

    def choose_action(self, sess, state, epsilon_greedy):
        if numpy.random.binomial(1, epsilon_greedy) == 1:
            action = random.choice(self.actions)
        else:
            x = numpy.asarray(numpy.expand_dims(state, axis=0) / self.input_scale,
                              dtype=numpy.float32)
            action = self.q_network.get_q_action(sess, x)[0]
        return action

    def play(self, action):
        r, new_state, termination = self.game.play_action(action)
        return r, new_state, termination

    def update_target_network(self, sess):
        sess.run(self.clone_op)

    def train(self, sess, saver=None):
        num_of_trials = -1
        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame
            for _ in range(self.config['T']):
                num_of_trials += 1
                epsilon_greedy = self.epsilon_min + \
                    max(self.epsilon_decay - num_of_trials, 0) / \
                    self.epsilon_decay * (1 - self.epsilon_min)
                print("epi {}, frame {}k: reward {}, eps {}".format(
                    episode, int(num_of_trials / 1000),
                    self.game.get_total_reward(), epsilon_greedy))
                if num_of_trials % self.update_interval == 0:
                    self.optimizer.train_one_step(sess, num_of_trials, self.batch_size)
                state = self.replay_memory.phi(frame)
                action = self.choose_action(sess, state, epsilon_greedy)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame
                if num_of_trials % self.time_between_two_copies == 0:
                    self.update_target_network(sess)
                    self.save(sess, saver)
                if self.callback:
                    self.callback()
                if termination:
                    score = self.game.get_total_reward()
                    summary_str = sess.run(self.summary_op,
                                           feed_dict={self.t_score: score})
                    self.summary_writer.add_summary(summary_str, num_of_trials)
                    self.summary_writer.flush()
                    break

    def evaluate(self, sess):
        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame
            for _ in range(self.config['T']):
                print("episode {}, total reward {}".format(
                    episode, self.game.get_total_reward()))
                state = self.replay_memory.phi(frame)
                action = self.choose_action(sess, state, self.epsilon_min)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame
                if self.callback:
                    self.callback()
                if termination:
                    break

    def save(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.save(sess, checkpoint_path)
            except:
                pass

    def load(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.restore(sess, checkpoint_path)
            except:
                pass
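DQN.train above anneals the exploration rate linearly: epsilon starts at 1.0 and decreases to epsilon_min over epsilon_decay trials, then stays flat. A standalone sketch of that schedule, with hypothetical names (linear_epsilon, eps_min, decay_steps) and example settings:

def linear_epsilon(step, eps_min, decay_steps):
    """Linearly annealed epsilon: 1.0 at step 0, eps_min after decay_steps trials."""
    return eps_min + max(decay_steps - step, 0) / float(decay_steps) * (1.0 - eps_min)

# With eps_min=0.1 and decay_steps=1e6 (illustrative values):
# step 0 -> 1.0, step 500000 -> 0.55, any step >= 1000000 -> 0.1
for step in (0, 500000, 1000000, 2000000):
    print(step, linear_epsilon(step, 0.1, 1000000))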
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: policy_weights_path = '/home/pulver/Desktop/simulation_8/checkpoint/episode_20500/policy/policy_checkpoint.ckp' summary_folder = "" # if empty the summary is written in ./log/ + current time start_episode = 0 # change to last episode number frame_counter = 0 # change to last number of frames epsilon = 0.1 screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend'] # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend) tot_actions = 6 batch_size = 64 # size of the experience batch tot_steps = 1000 # finite-horizont simulation # 10x10^6 high number since training can be stopped at every time tot_episodes = 210 steps_per_episodes = 40 # expressed in number step noop_time = 2.0 # pause in seconds between actions num_ground_plane = 70 actual_ground_index = 0 # episode_per_ground specify the number of episodes with the same ground plane ground_counter = 1 episode_per_ground = 10 timer_total_start = time.time() rospy.init_node("DRLTestNode") rospy.loginfo("----- DRL Test Node -----") # Create a subscriber fot the greyscale image # rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)#TODO # restore default rospy.Subscriber( "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback,queue_size=30) # Store the last 30 messages before discarding them images_stack_size = 4 r = rospy.Rate(10) # 10hz ground_list = ["asphalt11", "asphalt12", "asphalt13", "brick11", "brick12", "brick13", "grass11", "grass12", "grass13", "pavement11", "pavement12", "pavement13", "sand11", "sand12", "sand13", "snow11", "snow12", "snow13", "soil11", "soil12", "soil13"] # Init session and networks sess = tf.Session() if(summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter( summary_folder, sess.graph) policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=( screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") init = tf.global_variables_initializer() sess.run(init) if(policy_weights_path != ""): print("Loading weights from memory: " + str(policy_weights_path)) policy_network.load_weights(policy_weights_path) else: print("The networks path are empty. Choose appropriate weights!") sys.exit() # start here for episode in range(start_episode, tot_episodes): # Reset the ground in the environment every 50 episodes (or episode_per_ground) ground_index = episode / episode_per_ground if ground_index != actual_ground_index or (episode == 0): clean_world(ground_list) generate_new_world(ground_index, ground_list) actual_ground_index = ground_index timer_start = time.time() actual_time = rospy.get_rostime() rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0 frame_episode = 0 cumulated_reward = 0 epsilon_used = 0 send_action("takeoff") print("") print("Episode: " + str(episode)) # 1-Accumulate the first state reset_pose() print "Reset pose!" 
send_action("stop") rospy.sleep(1.0) # get_image() state = _last_image # create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) for step in range(tot_steps): # 2- Get the action following epsilon-greedy or through policy network. # Here image_t is always ready and it can be given as input to # the network epsilon_used_bool = False if(np.random.random_sample(1) < epsilon): epsilon_used += 1 action = get_random_action() epsilon_used_bool = True # Take the action from the policy network else: # Here image_t is always ready and it can be given as input to # the network action_distribution = policy_network.return_action_distribution( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False) action = np.argmax(action_distribution) action = convert_action_int_to_str(action) #print action send_action(action) rospy.sleep(noop_time) # 3- Get the state_t1 repeating the action obtained at step-2 and add everything # in the replay buffer. Then set state_t = state_t1 #get_image() image_t1 = _last_image send_action("stop") # If the action taken is to land and the UAV is inside the landing BB, done will be calculated accordingly done_reward = get_done_reward() reward = done_reward.reward done = done_reward.done if epsilon_used_bool: print " Step(" + str(step) + "), Action: " + action + ", Altitude: " + str(done_reward.z) + ", Reward: " + str(reward) else: print "#Step(" + str(step) + "), Action: " + action + ", Altitude: " + str(done_reward.z) + ", Reward: " + str(reward) # ['left', 'right', 'forward', 'backward', 'stop', 'land' print(" left:" + str(action_distribution[0][0][0]) + "; right:" + str(action_distribution[0][0][1]) + "; forward:" + str(action_distribution[0][0][2]) + "; backward:" + str(action_distribution[0][0][3]) + "; stop:" + str(action_distribution[0][0][4]) + "; descend:" + str(action_distribution[0][0][5])) image_t1 = np.expand_dims(image_t1, 2) # stack the images image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2) frame_counter += 1 # To call every time a frame is obtained frame_episode += 1 image_t = image_t1 get_relative_pose(ground_index, episode, step, reward) cumulated_reward += reward # At every step check if more than 30 seconds from the start passed. # In that case, set done = True and end the episode timer_stop = time.time() # Stop the episode if the number of frame is more than a threshold if frame_episode >= steps_per_episodes: done = True # When the episode is done if done: timer_stop = time.time() actual_time = rospy.get_rostime() rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0 rospy_time_elapsed = rospy_stop_time - rospy_start_time print("Tot Frame counter: " + str(frame_counter)) print("Time episode: " + str(timer_stop - timer_start) + " seconds") print("Ros time episode: " + str(rospy_time_elapsed) + " seconds") if cumulated_reward >= 0: rospy.logwarn("Positive reward obtained!") print("Cumulated reward: " + str(cumulated_reward)) print("Episode finished after {} timesteps".format(step + 1)) print("Epsilon used: " + str(epsilon_used) + " out of " + str(step + 1) + "(" + str(float((epsilon_used * 100.0) / (step + 1.0))) + "%)") sys.stdout.flush() time.sleep(5) break
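The testing loop above keeps the network input as a rolling stack of the last four 84x84 grayscale frames along the channel axis: the oldest frame is dropped and the newest appended at every step. A minimal sketch of that update, with a hypothetical helper name:

import numpy as np

def update_frame_stack(stack, new_frame):
    """Drop the oldest stacked frame and append the newest one.
    `stack` has shape (84, 84, 4), `new_frame` has shape (84, 84)."""
    new_frame = np.expand_dims(new_frame, 2)               # (84, 84, 1)
    return np.append(stack[:, :, 1:], new_frame, axis=2)   # (84, 84, 4)

# First state: repeat the initial frame four times, then roll in new frames.
frame = np.zeros((84, 84), dtype=np.uint8)
stack = np.stack([frame] * 4, axis=2)
stack = update_frame_stack(stack, frame)
print(stack.shape)  # (84, 84, 4)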
class DeepQ_agent: """ Represents the DQN agent. """ def __init__(self, env, hidden_units = None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95): """ Creates a DQN agent. :param env: game environment. :type env: Class Snake_Env(). :param hidden_units: number of neurons in each layer. :type hidden_units: tupple with dimension (1, 3). :param network_LR: learning rate of the action-value neural network. :type network_LR: float. :param batch_size: size of the minibatch taken from the replay buffer. :type batch_size: int. :param update_every: number of iterations for updating the target qnetwork. :type update_every: int :param gamma: discount factor. :type gamma: float. """ self.env = env self.BATCH_SIZE = batch_size self.GAMMA = gamma self.NETWORK_LR = network_LR self.MEMORY_CAPACITY = int(1e5) self.ACTION_SIZE = env.ACTION_SPACE self.HIDDEN_UNITS = hidden_units self.UPDATE_EVERY = update_every self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE, hidden_units = self.HIDDEN_UNITS, output_size = self.ACTION_SIZE, learning_rate = self.NETWORK_LR) self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE, hidden_units = self.HIDDEN_UNITS, output_size = self.ACTION_SIZE, learning_rate = self.NETWORK_LR) self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) #Temp variable self.t = 0 def learn(self): """ Learn from memorized experience. """ if self.memory.__len__() > self.BATCH_SIZE: states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE) #Calculating action-values using local network target = self.qnetwork_local.predict(states, self.BATCH_SIZE) #Future action-values using target network target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE) #Future action-values using local network target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE) max_action_values = np.argmax(target_next, axis=1) #action selection for i in range(self.BATCH_SIZE): if dones[i]: target[i][actions[i]] = rewards[i] else: target[i][actions[i]] = rewards[i] + self.GAMMA*target_val[i][max_action_values[i]] #action evaluation self.qnetwork_local.train(states, target, batch_size = self.BATCH_SIZE) if self.t == self.UPDATE_EVERY: self.update_target_weights() self.t = 0 else: self.t += 1 def act(self, state, epsilon=0.0): """ Chooses an action using an epsilon-greedy policy. :param state: current state. :type state: NumPy array with dimension (1, 18). :param epsilon: epsilon used in epsilon-greedy policy. :type epsilon: float :return action: action chosen by the agent. :rtype: int """ state = state.reshape((1,)+state.shape) action_values = self.qnetwork_local.predict(state) #returns a vector of size = self.ACTION_SIZE if random() > epsilon: action = np.argmax(action_values) #choose best action - Exploitation else: action = randint(0, self.ACTION_SIZE-1) #choose random action - Exploration return action def add_experience(self, state, action, reward, next_state, done): """ Add experience to agent's memory. """ self.memory.add(state, action, reward, next_state, done) def update_target_weights(self): """ Updates values of the Target network. """ self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
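Note that DeepQ_agent.learn implements the Double DQN target: the greedy next action is selected with the local network (the variable max_action_values actually holds argmax indices, not values) and then evaluated with the target network, instead of taking the plain max over the target network. A NumPy sketch of that target, assuming batched Q-value arrays sampled from the replay buffer (the function name is hypothetical):

import numpy as np

def double_dqn_targets(q_local_next, q_target_next, rewards, dones, gamma):
    """Double DQN targets: action chosen by the local net, value read from the target net."""
    best_actions = np.argmax(q_local_next, axis=1)                     # action selection
    evaluated = q_target_next[np.arange(len(rewards)), best_actions]   # action evaluation
    return rewards + gamma * evaluated * (1.0 - dones)

# Toy batch of 2 transitions with made-up Q-values.
q_local_next = np.array([[1.0, 2.0], [0.5, 0.1]])
q_target_next = np.array([[0.8, 1.5], [0.4, 0.2]])
print(double_dqn_targets(q_local_next, q_target_next,
                         rewards=np.array([1.0, 0.0]),
                         dones=np.array([0.0, 1.0]),
                         gamma=0.95))  # [2.425, 0.0]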
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: policy_weights_path_1 = '/home/pulver/Desktop/episode_113250/policy/policy_checkpoint.ckp' root_images = "/home/pulver/Desktop/network_testing/" + \ str(datetime.datetime.now().time()) + "/" # NOTE: openCV doesn't write in a folder that does not exist if not os.path.exists(root_images): os.makedirs(root_images) source = 3 # NOTE: 1 for real drone, 2 for gazebo, 3 for URL screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend'] # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend) tot_actions = 6 batch_size = 32 # size of the experience batch rospy.init_node("DRLTrainingNode") rospy.loginfo("----- DRL Training Node -----") # Create a subscriber fot the greyscale image # 1) Real drone if source == 1: rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback) elif source == 2: # 2) Gazebo rospy.Subscriber("/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30) # Store the last 30 messages elif source == 3: # 3) URL # video_stream_url = 'http://10.188.34.59:8080/videofeed' video_stream_url = "http://10.5.5.9:8080/live/amba.m3u8" bytes = '' else: print "Insert a correct source value (1 for real drone, 2 for gazebo, 3 for URL)" images_stack_size = 4 tot_steps = 3000000 # finite-horizont simulation r = rospy.Rate(1) # 10hz # Init session and networks sess = tf.Session() summary_folder = "" # if empty the summary is written in ./log/ + current time if(summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter( summary_folder, sess.graph) policy_network_1 = QNetwork(sess, tot_actions=tot_actions, image_shape=( screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") # Instructions for updating: Use `tf.global_variables_initializer init = tf.global_variables_initializer() sess.run(init) # Load Neural Networks weights from memory if a valid checkpoint path is # passed if(policy_weights_path_1 != ""): print("Loading weights 1 from memory...") policy_network_1.load_weights(policy_weights_path_1) else: print("The networks path are empty.") # sys.exit() if source == 3: state = acquire_frame(video_stream_url) time.sleep(1) state = _last_image # Save first image image_path = root_images + "image_0.png" cv2.imwrite(image_path, state) # 1 - Create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) matplotlib.interactive(True) # fig = plt.figure() f, (ax3, ax2) = plt.subplots(2, 1) f.tight_layout() # gs = gridspec.GridSpec(2, 2, width_ratios=[1, 1]) # gs.update(left=0.05, right=0.95, wspace=0.05, hspace=0) # fig.set_size_inches(8, 4) f.patch.set_facecolor('gray') for step in range(1, 100000000): ########################################## ## CAMERA ## ########################################## pad_size = 1 pad_value = 0 # print "Shape image_t:" + str(np.shape(image_t)) image = np.lib.pad(image_t[:, :, 0], (pad_size, pad_size), 'constant', constant_values=(pad_value, pad_value)) for d in range(1, images_stack_size): image_stack_padded = np.lib.pad( image_t[:, :, d], (pad_size, 
pad_size), 'constant', constant_values=(pad_value, pad_value)) image = np.append(image, image_stack_padded, axis=1) # print"shape: " + str(np.shape(image)) # Plot in the first row the camera images # ax1 = plt.subplot(2, 1, 1) # ax1 = plt.subplot(gs[0, :]) # but first clear the old one # ax1.clear() # ax1.axis("off") # specify greyscale instead BGR # ax1.imshow(image, cmap='gray') ######################################### # 2 - Forward in input to NN # action_distribution_1 = policy_network_1.return_action_distribution( # input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), # softmax=False) action_distribution_1, conv1, conv2, conv3 = policy_network_1.return_everything( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False) action = np.argmax(action_distribution_1) action = convert_action_int_to_str(action) print "######################" print "Action distribution: " + str(action_distribution_1) print "Action_1 selected: " + action # print "Shape action dis:" + str(np.shape(action_distribution_1)) # print "Shape conv1:" + str(np.shape(conv1)) # print "Shape conv2:" + str(np.shape(conv2)) # print "Shape conv3:" + str(np.shape(conv3)) ########################################## ## FILTERS ## ########################################## ax3.clear() ax3.axis("off") padding = np.ones((1, 21 * 16)) # 1 Conv layer ########################### conv1 = np.reshape(conv1, (21, 21, 32)) image_cv1_1 = np.reshape(conv1[:, :, 0:16], (21, 21 * 16), order='F') image_cv1_2 = np.reshape(conv1[:, :, 16:32], (21, 21 * 16), order='F') # image_cv1_1 = np.reshape(conv1[:, :, 0:8], (21, 21 * 8), order='F') # image_cv1_2 = np.reshape(conv1[:, :, 8:16], (21, 21 * 8), order='F') # image_cv1_3 = np.reshape(conv1[:, :, 16:24], (21, 21 * 8), order='F') # image_cv1_4 = np.reshape(conv1[:, :, 24:32], (21, 21 * 8), order='F') image_cv1 = np.concatenate( (padding, image_cv1_1, image_cv1_2), axis=0) # Save filters filter_path = root_images + "filters/step_" + \ str(step) + "/" if not os.path.exists(filter_path): os.makedirs(filter_path) filter_path = filter_path + "conv_1.jpg" I = image_cv1 I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8) image_cv1_resized = cv2.resize(I8, (84 * 4, 21 * 2), interpolation=cv2.INTER_AREA) img = Image.fromarray(image_cv1_resized) img.save(filter_path) # 2 Conv layer ########################### padding = np.zeros((1, 11 * 32)) conv2 = np.reshape(conv2, (11, 11, 64)) image_cv2_1 = np.reshape(conv2[:, :, 0:32], (11, 11 * 32), order='F') image_cv2_2 = np.reshape(conv2[:, :, 32:64], (11, 11 * 32), order='F') # image_cv2_3 = np.reshape(conv2[:, :, 16:24], (21, 21 * 8), order='F') # image_cv2_4 = np.reshape(conv2[:, :, 24:32], (21, 21 * 8), order='F') image_cv2 = np.concatenate( (padding, image_cv2_1, image_cv2_2), axis=0) # Save filters filter_path = root_images + "filters/step_" + \ str(step) + "/" if not os.path.exists(filter_path): os.makedirs(filter_path) filter_path = filter_path + "conv_2.jpg" I = image_cv2 I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8) image_cv2_resized = cv2.resize(I8, (84 * 4, 11 * 2), interpolation=cv2.INTER_AREA) img = Image.fromarray(image_cv2_resized) img.save(filter_path) # 3 Conv layer ########################### padding = np.ones((1, 11 * 32)) conv3 = np.reshape(conv3, (11, 11, 64)) image_cv3_1 = np.reshape(conv3[:, :, 0:32], (11, 11 * 32), order='F') image_cv3_2 = np.reshape(conv3[:, :, 32:64], (11, 11 * 32), order='F') # image_cv2_3 = np.reshape(conv2[:, :, 16:24], (21, 21 * 8), order='F') # 
image_cv2_4 = np.reshape(conv2[:, :, 24:32], (21, 21 * 8), order='F') image_cv3 = np.concatenate( (padding, image_cv3_1, image_cv3_2), axis=0) # Save filters filter_path = root_images + "filters/step_" + \ str(step) + "/" if not os.path.exists(filter_path): os.makedirs(filter_path) filter_path = filter_path + "conv_3.jpg" I = image_cv3 I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8) image_cv3_resized = cv2.resize(I8, (84 * 4, 11 * 2), interpolation=cv2.INTER_AREA) img = Image.fromarray(image_cv3_resized) img.save(filter_path) # Assemble final image #################### image_input = np.reshape(image_t, (84, 84 * 4), order='F') image_input_resized = cv2.resize(image, (84 * 4, 84), interpolation=cv2.INTER_AREA) final_image = np.concatenate( (image_input_resized, image_cv1_resized, image_cv2_resized, image_cv3_resized)) # Plot the image # ax3 = plt.subplot(gs[0, :]) # but first clear the old one # specify greyscale instead BGR ax3 = plt.subplot(2, 1, 1) ax3.imshow(final_image, cmap='gray') ########################################## ## HISTOGRAM ## ########################################## # print np.shape(action_distribution_1) # print len(action_distribution_1[0]) ind = np.arange(len(action_distribution_1[0])) # print ind width = 0.4 # fig, ax = plt.subplots() # ax2 = plt.subplot(gs[1, :]) ax2.clear() ax2 = plt.subplot(2, 1, 2) ax2.set_aspect(3) rects = ax2.bar(0.4 + ind, action_distribution_1[0], width=width, color='r', alpha=0.4) rects[np.argmax(action_distribution_1)].set_color('g') ax2.set_xticks(0.4 + ind + width / 2) ax2.set_xticklabels(("left", "right", "forward", "backward", "stop", "descend")) for i in ax2.xaxis.get_ticklabels(): i.set_color("white") ax2.set_ylim([0, 1]) # plt.xticks(fontsize=16) # plt.show() ########################################## # 3 - Send command and wait for translation raw_input("Press a button to acquire images...") # 4 - Acquire second frame and add to the stack if source == 3: global _last_image _last_image = acquire_frame(video_stream_url) time.sleep(1) image_t1 = _last_image # Save every one image image image_path = root_images + "image_" + str(step) + ".png" cv2.imwrite(image_path, image_t1) image_t1 = np.expand_dims(image_t1, 2) image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2) # 5 - Forward stack in input to the network image_t = image_t1
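The filter visualization above repeats the same min-max scaling for each conv layer before saving the tiled activations as a grayscale image. A small helper capturing that expression, under the assumption that a tiny epsilon is acceptable to guard against constant activation maps (the name to_uint8_image is hypothetical):

import numpy as np

def to_uint8_image(activation_map):
    """Min-max scale a 2-D activation tiling to the uint8 range for saving with PIL/OpenCV."""
    a_min, a_max = activation_map.min(), activation_map.max()
    scaled = (activation_map - a_min) / (a_max - a_min + 1e-8)  # epsilon avoids divide-by-zero
    return (scaled * 255.9).astype(np.uint8)

# Example on a random 21 x (21*16) tiling such as image_cv1 above.
tile = np.random.randn(21, 21 * 16)
print(to_uint8_image(tile).dtype, to_uint8_image(tile).max())  # uint8, <= 255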
def main(): """ Main function for training the DQN network in learning how to accomplish autonomous landing. """ # ATTENTION: If you want to restore files from a previous simulation you # must pass valid values for these variables: policy_weights_path = '/home/pulver/Desktop/episode_32400/policy/policy_checkpoint.ckp' summary_folder = "" # if empty the summary is written in ./log/ + current time start_episode = 0 # change to last episode number frame_counter = 0 # change to last number of frames screen_width = 84 # original is 160 screen_height = 84 # original is 210 images_stack_size = 4 # Use only the first 5 actions for this simulation # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (land), # 6 (ascend), 7 (descend), 8 (rotate_left), 9 (rotate_right) tot_actions = 6 batch_size = 32 # size of the experience batch tot_steps = 1000 # finite-horizont simulation # 10x10^6 high number since training can be stopped at every time tot_episodes = 1050 steps_per_episodes = 20 # expressed in number step noop_time = 2.0 # pause in seconds between actions num_ground_plane = 21 episodes_per_ground_plane = tot_episodes / num_ground_plane actual_ground_index = 0 timer_total_start = time.time() rospy.init_node("DeepReinforcedLanding") rospy.loginfo("----- Deep Reinforced Landing Node -----") rospy.Subscriber( "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30) # Store the last 30 messages before discarding them r = rospy.Rate(10) # 10hz # Init session and networks sess = tf.Session() if(summary_folder == ""): tf_summary_writer = tf.summary.FileWriter( './log/' + str(datetime.datetime.now().time()), sess.graph) else: tf_summary_writer = tf.summary.FileWriter( summary_folder, sess.graph) policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=( screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net") init = tf.global_variables_initializer() sess.run(init) # Load Neural Networks weights from memory if a valid checkpoint path is passed # if(os.path.isfile(policy_weights_path) == True and # os.path.isfile(target_weights_path) == True): try: print("Loading weights from memory...") policy_network.load_weights(policy_weights_path) except: print("Error while loading the weights!") return print "Episode, step, action, reward, action_distribution" # start here for episode in range(start_episode, tot_episodes): # Reset the ground in the environment every 25k frames # ground_index = episode / 50 # if ground_index != actual_ground_index or episode == 0: # generate_new_world(ground_index) # actual_ground_index = ground_index timer_start = time.time() actual_time = rospy.get_rostime() rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0 cumulated_reward = 0 # 1-Accumulate the first state #reset_pose() send_action("stop") rospy.sleep(1.0) state = _last_image # create a stack of X images image_t = np.stack([state] * images_stack_size, axis=2) frame_episode = 0 pad_size = 3 pad_value = 255 for step in range(tot_steps): # 2- Get the action following epsilon-greedy or through policy network. # With probability epsilon take random action otherwise it takes # an action from the policy network. 
            # (The epsilon-greedy random branch is disabled in this test script:
            # the action always comes from the policy network; the random-action
            # call is left commented out below.)
            image_stack_padded = [np.lib.pad(image_t[:, :, i], (pad_size, pad_size),
                                             'constant',
                                             constant_values=(pad_value, pad_value))
                                  for i in range(images_stack_size)]
            image_hstack = np.hstack(image_stack_padded)
            cv2.imwrite("/home/pulver/Desktop/print_network_action_distribution/img_ep" +
                        str(episode) + "_step" + str(step) + ".png", image_hstack)
            # Here image_t is always ready and it can be given as input to
            # the network
            action_distribution = policy_network.return_action_distribution(
                input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)),
                softmax=False)
            action = np.argmax(action_distribution)
            action = convert_action_int_to_str(action)
            # action = get_random_action()
            send_action(action)
            rospy.sleep(noop_time)
            # 3- Get the state_t1 repeating the action obtained at step-2 and add
            # everything in the replay buffer. Then set state_t = state_t1
            # get_image()
            image_t1 = _last_image
            # If the action taken is to land and the UAV is inside the landing
            # BB, done will be calculated accordingly
            done_reward = get_done_reward()
            reward = done_reward.reward
            done = done_reward.done
            image_t1 = np.expand_dims(image_t1, 2)
            # stack the images
            image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
            image_t = image_t1
            cumulated_reward += reward
            send_action("stop")
            frame_episode = frame_episode + 1
            # get_relative_pose(ground_index, episode, step, reward)
            # At every step check if more than 30 seconds from the start passed.
            # In that case, set done = True and end the episode
            timer_stop = time.time()
            print(str(episode) + "," + str(step) + "," + action + "," +
                  str(reward) + "," + str(action_distribution))
            # Stop the episode if the number of frames is more than a threshold
            if frame_episode >= steps_per_episodes:
                done = True
            # if timer_stop - timer_start >= 30.0:  # maximum time allowed
            #     cumulated_reward += -1
            #     done = True
            # When the episode is done
            if done:
                break
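The debug image written at the top of the loop above pads each of the four stacked frames with a bright border and lays them out side by side in a single PNG. A compact sketch of that padding and stacking, with a hypothetical helper name and an example output path:

import numpy as np
import cv2

def save_stack_strip(image_stack, path, pad_size=3, pad_value=255):
    """Pad each stacked frame with a bright border and write them side by side."""
    padded = [np.lib.pad(image_stack[:, :, i], (pad_size, pad_size), 'constant',
                         constant_values=(pad_value, pad_value))
              for i in range(image_stack.shape[2])]
    cv2.imwrite(path, np.hstack(padded))

# Usage with an (84, 84, 4) uint8 stack; the path below is only an example.
# save_stack_strip(image_t, "/tmp/img_ep0_step0.png")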