Example #1
    def build_main_networks(self):
        ##################################
        #### Build the learned critic ####
        ##################################
        self.critic = BuildQNetwork(self.state_placeholder,
                                    self.action_placeholder,
                                    scope='learner_critic_main')

        # Build the critic training function
        self.train_critic_one_step, self.projected_target_distribution = self.critic.generate_training_function(
            self.target_q_distribution_placeholder,
            self.target_bins_placeholder,
            self.importance_sampling_weights_placeholder)

        #################################
        #### Build the learned actor ####
        #################################
        self.actor = BuildActorNetwork(self.state_placeholder, scope='learner_actor_main')

        # Build the actor training function
        self.train_actor_one_step = self.actor.generate_training_function(self.dQ_dAction_placeholder)
Example #2
 def build_target_networks(self):
     ###########################################
     #### Build the target actor and critic ####
     ###########################################
     self.target_critic = BuildQNetwork(self.state_placeholder,
                                        self.action_placeholder,
                                        scope='learner_critic_target')
     self.target_actor = BuildActorNetwork(self.state_placeholder,
                                           scope='learner_actor_target')
Example #3
    def build_actor(self):
        # Generate the actor's policy neural network
        agent_name = 'agent_' + str(self.n_agent) # agent name 'agent_3', for example
        self.state_placeholder = tf.placeholder(dtype = tf.float32,
                                                shape = [None, Settings.OBSERVATION_SIZE],
                                                name = 'state_placeholder') # [None, OBSERVATION_SIZE]: the batch dimension is left flexible

        #############################
        #### Generate this Actor ####
        #############################
        self.policy = BuildActorNetwork(self.state_placeholder, scope = agent_name)
Example #4
class Learner:
    def __init__(self, sess, saver, replay_buffer, writer):
        print("Initialising learner...")

        # Saving items to the self. object for future use
        self.sess = sess
        self.saver = saver
        self.replay_buffer = replay_buffer
        self.writer = writer

        with tf.variable_scope("Preparing_placeholders"):
            # Defining placeholders for training

            self.state_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.OBSERVATION_SIZE],
                name="state_placeholder"
            )  # [None, OBSERVATION_SIZE]: the batch dimension is left flexible
            self.action_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.ACTION_SIZE],
                name="action_placeholder")  # placeholder for actions
            self.target_bins_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.NUMBER_OF_BINS],
                name="target_bins_placeholder"
            )  # Bin values of target network with Bellman update applied
            self.target_q_distribution_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.NUMBER_OF_BINS],
                name="target_q_distribution_placeholder"
            )  # Future q-distribution from target critic
            self.dQ_dAction_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[Settings.MINI_BATCH_SIZE, Settings.ACTION_SIZE],
                name="dQ_dAction_placeholder"
            )  # Gradient of critic predicted value with respect to input actions
            self.importance_sampling_weights_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=None,
                name="importance_sampling_weights_placeholder"
            )  # [PRIORITY_REPLAY_BUFFER only] Holds the weights that are used to remove bias from priority sampling

        # The discrete reward values (bins) whose likelihoods the distributional critic predicts
        self.bins = np.linspace(Settings.MIN_V,
                                Settings.MAX_V,
                                Settings.NUMBER_OF_BINS,
                                dtype=np.float32)

        ######################################################
        ##### Build the networks and training operations #####
        ######################################################
        self.build_main_networks()
        self.build_target_networks()

        # Build the operation to update the target network parameters
        self.build_target_parameter_update_operations()

        # Create operations for TensorBoard logging
        self.create_summary_functions()

        print("Learner created!")

    def create_summary_functions(self):

        # Creates the operation that, when run, will log the appropriate data to tensorboard
        with tf.variable_scope("Logging_Learning"):
            # The critic loss during training is the only logged item
            self.iteration_loss_placeholder = tf.placeholder(tf.float32)
            self.iteration_loss_summary = tf.summary.scalar(
                "Loss", self.iteration_loss_placeholder)
            self.iteration_summary = tf.summary.merge(
                [self.iteration_loss_summary])

    def build_main_networks(self):
        ##################################
        #### Build the learned critic ####
        ##################################
        self.critic = BuildQNetwork(self.state_placeholder,
                                    self.action_placeholder,
                                    scope='learner_critic_main')

        # Build the critic training function
        self.train_critic_one_step, self.projected_target_distribution = self.critic.generate_training_function(
            self.target_q_distribution_placeholder,
            self.target_bins_placeholder,
            self.importance_sampling_weights_placeholder)

        #################################
        #### Build the learned actor ####
        #################################
        self.actor = BuildActorNetwork(self.state_placeholder,
                                       scope='learner_actor_main')

        # Build the actor training function
        self.train_actor_one_step = self.actor.generate_training_function(
            self.dQ_dAction_placeholder)

    def build_target_networks(self):
        ###########################################
        #### Build the target actor and critic ####
        ###########################################
        self.target_critic = BuildQNetwork(self.state_placeholder,
                                           self.action_placeholder,
                                           scope='learner_critic_target')
        self.target_actor = BuildActorNetwork(self.state_placeholder,
                                              scope='learner_actor_target')

    def build_target_parameter_update_operations(self):
        # Build operations that either:
        # 1) initialize the target networks to be identical to the main networks, or
        # 2) slowly copy the main network parameters to the target networks according to Settings.TARGET_NETWORK_TAU
        main_parameters = self.actor.parameters + self.critic.parameters
        target_parameters = self.target_actor.parameters + self.target_critic.parameters

        # Build operation that fully copies the main network parameters to the targets [Option 1 above]
        initialize_target_network_parameters = []
        # Looping across all variables in the main critic and main actor
        for source_variable, destination_variable in zip(
                main_parameters, target_parameters):
            initialize_target_network_parameters.append(
                destination_variable.assign(source_variable))

        # Build operation that slowly updates target networks according to Settings.TARGET_NETWORK_TAU [Option 2 above]
        update_target_network_parameters = []
        # Looping across all variables in the main critic and main actor
        for source_variable, destination_variable in zip(
                main_parameters, target_parameters):
            # target = tau*main + (1 - tau)*target
            update_target_network_parameters.append(
                destination_variable.assign((
                    tf.multiply(source_variable, Settings.TARGET_NETWORK_TAU) +
                    tf.multiply(destination_variable,
                                1. - Settings.TARGET_NETWORK_TAU))))

        # Save both operations to self object for later use
        self.initialize_target_network_parameters = initialize_target_network_parameters
        self.update_target_network_parameters = update_target_network_parameters

    def generate_queue(self):
        # Generate the queues responsible for communicating with the learner
        self.agent_to_learner = multiprocessing.Queue(maxsize=1)
        self.learner_to_agent = multiprocessing.Queue(maxsize=1)

        return self.agent_to_learner, self.learner_to_agent

    def run(self, stop_run_flag, replay_buffer_dump_flag,
            starting_training_iteration):
        # Continuously train the actor and the critic, by applying stochastic gradient
        # descent to batches of data sampled from the replay buffer
        print("Starting to run learner at iteration %i" %
              starting_training_iteration)

        # Initializing the counter of training iterations
        self.total_training_iterations = starting_training_iteration

        # Starting time
        start_time = time.time()

        # Initialize the target networks to be identical to the main networks
        self.sess.run(self.initialize_target_network_parameters)

        # Setup priority replay buffer parameters, if used
        if Settings.PRIORITY_REPLAY_BUFFER:
            # When priority_beta = 1, the priority sampling bias is fully accounted for.
            # We slowly anneal priority_beta towards 1.0 over the course of training.
            # Lower beta allows the prioritized samples to be weighted unfairly,
            # but this can help training, at least initially.
            priority_beta = Settings.PRIORITY_BETA_START  # starting beta value
            beta_increment = (
                Settings.PRIORITY_BETA_END - Settings.PRIORITY_BETA_START
            ) / Settings.MAX_TRAINING_ITERATIONS  # to increment on each iteration
        else:
            # If we aren't using a priority buffer, set the importance sampled weights to ones for the entire run
            weights_batch = np.ones(shape=Settings.MINI_BATCH_SIZE)

        ###############################
        ##### Start Training Loop #####
        ###############################
        while self.total_training_iterations < Settings.MAX_TRAINING_ITERATIONS and not stop_run_flag.is_set():

            # Check if the agent wants some q-distributions calculated
            try:
                state_log, action_log, next_state_log, reward_log, done_log, gamma_log = self.agent_to_learner.get(
                    False)

                # Reshaping
                gamma_log = np.reshape(gamma_log, [-1, 1])

                # Get the online q-distribution
                critic_distribution = self.sess.run(
                    self.critic.q_distribution,
                    feed_dict={
                        self.state_placeholder: state_log,
                        self.action_placeholder: action_log
                    })  # [episode length, number of bins]

                # Clean next actions from the target actor
                clean_next_actions = self.sess.run(
                    self.target_actor.action_scaled,
                    {self.state_placeholder: next_state_log
                     })  # [episode length, num_actions]

                # Get the target q-distribution
                target_critic_distribution = self.sess.run(
                    self.target_critic.q_distribution,
                    feed_dict={
                        self.state_placeholder: state_log,
                        self.action_placeholder: clean_next_actions
                    })  # [episode length, number of bins]

                # Create batch of bins [see further description below]
                target_bins = np.repeat(
                    np.expand_dims(self.bins, axis=0), len(reward_log),
                    axis=0)  # [episode length, number_of_bins]
                target_bins[done_log, :] = 0.0
                target_bins = np.expand_dims(
                    reward_log, axis=1) + (target_bins * gamma_log)

                # Calculating the bellman distribution (r + gamma*target_q_distribution). The critic loss is with respect to this projection.
                projected_target_distribution = self.sess.run(
                    self.projected_target_distribution,
                    feed_dict={
                        self.target_q_distribution_placeholder:
                        target_critic_distribution,
                        self.target_bins_placeholder: target_bins
                    })

                # Calculating the loss at each timestep
                weights_batch = np.ones(shape=len(reward_log))
                loss_log = self.sess.run(
                    self.critic.loss,
                    feed_dict={
                        self.state_placeholder: state_log,
                        self.action_placeholder: action_log,
                        self.target_q_distribution_placeholder:
                        target_critic_distribution,
                        self.target_bins_placeholder: target_bins,
                        self.importance_sampling_weights_placeholder:
                        weights_batch
                    })

                # Send the results back to the agent
                self.learner_to_agent.put(
                    (critic_distribution, target_critic_distribution,
                     projected_target_distribution, loss_log))

            except queue.Empty:
                # If queue was empty, do nothing
                pass

            # If we don't have enough data yet to train OR we want to wait before we start to train
            if (self.replay_buffer.how_filled() < Settings.MINI_BATCH_SIZE
                ) or (self.replay_buffer.how_filled() <
                      Settings.REPLAY_BUFFER_START_TRAINING_FULLNESS):
                continue  # Skip this training iteration. Wait for more training data.

            # Sample a mini-batch of data from the replay_buffer
            if Settings.PRIORITY_REPLAY_BUFFER:
                sampled_batch = self.replay_buffer.sample(priority_beta)
                weights_batch = sampled_batch[
                    6]  # [priority-only data] used for removing bias in prioritized data
                index_batch = sampled_batch[
                    7]  # [priority-only data] used for updating priorities
            else:
                sampled_batch = self.replay_buffer.sample()

            # Unpack the training data
            states_batch = sampled_batch[0]
            actions_batch = sampled_batch[1]
            rewards_batch = sampled_batch[2]
            next_states_batch = sampled_batch[3]
            dones_batch = sampled_batch[4]
            gammas_batch = sampled_batch[5]

            ###################################
            ##### Prepare Critic Training #####
            ###################################
            # Get clean next actions by feeding the next states through the target actor
            clean_next_actions = self.sess.run(
                self.target_actor.action_scaled,
                {self.state_placeholder: next_states_batch
                 })  # [batch_size, num_actions]

            # Get the next q-distribution by passing the next states and clean next actions through the target critic
            target_critic_distribution = self.sess.run(
                self.target_critic.q_distribution, {
                    self.state_placeholder: next_states_batch,
                    self.action_placeholder: clean_next_actions
                })  # [batch_size, number_of_bins]

            # Create batch of bins
            target_bins = np.repeat(np.expand_dims(self.bins, axis=0),
                                    Settings.MINI_BATCH_SIZE,
                                    axis=0)  # [batch_size, number_of_bins]

            # If this data in the batch corresponds to the end of an episode (dones_batch[i] = True),
            # set all the bins to 0.0. This will eliminate the inclusion of the predicted future
            # reward when computing the bellman update (i.e., the predicted future rewards are only
            # the current reward, since we aren't continuing the episode any further).
            target_bins[dones_batch, :] = 0.0

            # Bellman projection. reward + gamma^N*bin -> The new
            # expected reward, according to the recently-received reward.
            # If the new reward is outside of the current bin, then we will
            # adjust the probability that is assigned to the bin.
            target_bins = np.expand_dims(rewards_batch,
                                         axis=1) + (target_bins * gammas_batch)

            #####################################
            ##### TRAIN THE CRITIC ONE STEP #####
            #####################################
            critic_loss, _ = self.sess.run(
                [self.critic.loss, self.train_critic_one_step], {
                    self.state_placeholder: states_batch,
                    self.action_placeholder: actions_batch,
                    self.target_q_distribution_placeholder:
                    target_critic_distribution,
                    self.target_bins_placeholder: target_bins,
                    self.importance_sampling_weights_placeholder: weights_batch
                })

            ##################################
            ##### Prepare Actor Training #####
            ##################################
            # Get clean actions that the main actor would have taken for this batch of states if there were no noise added
            clean_actions = self.sess.run(
                self.actor.action_scaled,
                {self.state_placeholder: states_batch})

            # Calculate the derivative of the main critic's q-value with respect to these actions
            dQ_dAction = self.sess.run(
                self.critic.dQ_dAction, {
                    self.state_placeholder: states_batch,
                    self.action_placeholder: clean_actions
                })  # also known as action gradients

            ####################################
            ##### TRAIN THE ACTOR ONE STEP #####
            ####################################
            self.sess.run(
                self.train_actor_one_step, {
                    self.state_placeholder: states_batch,
                    self.dQ_dAction_placeholder: dQ_dAction[0]
                })

            # If it's time to update the target networks
            if self.total_training_iterations % Settings.UPDATE_TARGET_NETWORKS_EVERY_NUM_ITERATIONS == 0:
                # Update target networks according to TAU!
                self.sess.run(self.update_target_network_parameters)

            # If we're using a priority buffer, tend to it now.
            if Settings.PRIORITY_REPLAY_BUFFER:
                # The priority replay buffer ranks the data according to how unexpected they were
                # An unexpected data point will have high loss. Now that we've just calculated the loss,
                # update the priorities in the replay buffer.
                self.replay_buffer.update_priorities(
                    index_batch,
                    (np.abs(critic_loss) + Settings.PRIORITY_EPSILON))

                # Increment priority beta value slightly closer towards 1.0
                priority_beta += beta_increment

                # If it's time to check whether the prioritized replay buffer is overfull
                if self.total_training_iterations % Settings.DUMP_PRIORITY_REPLAY_BUFFER_EVER_NUM_ITERATIONS == 0:
                    # If the buffer is overfilled
                    if (self.replay_buffer.how_filled() >
                            Settings.REPLAY_BUFFER_SIZE):
                        # Make the agents wait before adding any more data to the buffer
                        replay_buffer_dump_flag.clear()
                        # How overfull is the buffer?
                        samples_to_remove = self.replay_buffer.how_filled() - Settings.REPLAY_BUFFER_SIZE
                        # Remove the appropriate number of samples
                        self.replay_buffer.remove(samples_to_remove)
                        # Allow the agents to continue now that the buffer is ready
                        replay_buffer_dump_flag.set()

            # If it's time to log the training performance to TensorBoard
            if self.total_training_iterations % Settings.LOG_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS == 0:
                # Logging the mean critic loss across the batch
                summary = self.sess.run(self.iteration_summary,
                                        feed_dict={
                                            self.iteration_loss_placeholder:
                                            np.mean(critic_loss)
                                        })
                self.writer.add_summary(summary,
                                        self.total_training_iterations)

            # If it's time to save a checkpoint (a regular checkpoint, the final planned iteration, or an early stop)
            if (self.total_training_iterations %
                    Settings.SAVE_CHECKPOINT_EVERY_NUM_ITERATIONS
                    == 0) or (self.total_training_iterations
                              == Settings.MAX_TRAINING_ITERATIONS
                              ) or stop_run_flag.is_set():
                # Save the state of all networks and note the training iteration
                self.saver.save(self.total_training_iterations,
                                self.state_placeholder,
                                self.actor.action_scaled)

                # Make the agents wait before adding any more data to the buffer
                replay_buffer_dump_flag.clear()
                # Save the replay buffer
                self.replay_buffer.save()
                # Allow the agents to continue now that the buffer is ready
                replay_buffer_dump_flag.set()

            # If it's time to print the training performance to the screen
            if self.total_training_iterations % Settings.DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS == 0:
                print(
                    "Trained actor and critic %i iterations in %.2f minutes, at %.3f s/iteration. Now at iteration %i."
                    % (Settings.
                       DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS,
                       (time.time() - start_time) / 60,
                       (time.time() - start_time) / Settings.
                       DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS,
                       self.total_training_iterations))
                start_time = time.time()  # reset the timer for the next DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS iterations

            # Incrementing training iteration counter
            self.total_training_iterations += 1

        # If we are done training
        print("Learner finished after running " +
              str(self.total_training_iterations) + " training iterations!")

        # Flip the flag signalling all agents to stop
        stop_run_flag.set()
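
The target-bin construction inside the training loop above (repeat the fixed bin support for the batch, zero it on terminal transitions, then shift it by reward + gamma^N * bin) is the core of the distributional Bellman update; the shifted bins are later projected back onto the fixed support inside the critic's training function. Below is a minimal NumPy sketch of that bin-shifting step, using made-up batch values and illustrative stand-ins for the Settings constants (nothing here is the repository's API):

import numpy as np

# Illustrative stand-ins for Settings.MIN_V, Settings.MAX_V, Settings.NUMBER_OF_BINS
MIN_V, MAX_V, NUMBER_OF_BINS = -10.0, 10.0, 51
bins = np.linspace(MIN_V, MAX_V, NUMBER_OF_BINS, dtype=np.float32)

# Hypothetical mini-batch of transitions
rewards = np.array([1.0, -0.5, 2.0], dtype=np.float32)
gammas  = np.array([[0.99], [0.99], [0.99]], dtype=np.float32)  # already raised to the N-step power
dones   = np.array([False, True, False])

# Repeat the bin support for every transition in the batch: [batch_size, NUMBER_OF_BINS]
target_bins = np.repeat(np.expand_dims(bins, axis=0), len(rewards), axis=0)

# Terminal transitions contribute no future return, so their bins collapse to zero
target_bins[dones, :] = 0.0

# Bellman-shift every bin: r + gamma^N * bin
target_bins = np.expand_dims(rewards, axis=1) + target_bins * gammas

print(target_bins.shape)   # (3, 51)
print(target_bins[1, :3])  # terminal row: every entry equals its reward (-0.5)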
Example #5
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti", "--target_id", dest='target_id', default=0, type=int, help="Target aircraft ID")
    parser.add_argument("-fi", "--follower_id", dest='follower_id', default=0, type=int, help="Follower aircraft ID")
    parser.add_argument("-f", "--filename", dest='log_filename', default='log_000', type=str, help="Log file name")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename
    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i=0 # for log increment

    ## Prepare Filter Specs : ##
#    fs = 5        # Sampling frequency
#    order = 2
#    cutoff = 2.0
#    nyq = 0.5 * fs
#    normal_cutoff = cutoff / nyq
#    b, a = signal.butter(order, normal_cutoff)
#    dgn = dge = dgd = signal.lfilter_zi(b, a)
    


    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(dtype = tf.float32, shape = [None, Settings.OBSERVATION_SIZE], name = "state_placeholder")
        actor = BuildActorNetwork(state_placeholder, scope='learner_actor_main')
    
        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver() # initialize the tensorflow Saver()
    
        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")
    
        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit
    
        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################


        try:
            start_time = time.time()
            g = Guidance(interface=interface, target_id=target_id, follower_id=follower_id)
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)
            last_target_yaw = 0.0
            total_time = 0.0
            
            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:                    
                # Create state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(maxsize = Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)
        
                # Fill it with zeros to start
                for i in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)
            
            if Settings.AUGMENT_STATE_WITH_STATE_LENGTH > 0: 
                # Create state-augmentation queue (holds previous raw total states)
                past_states = queue.Queue(maxsize = Settings.AUGMENT_STATE_WITH_STATE_LENGTH)
                
                # Fill it with zeros to start
                for i in range(Settings.AUGMENT_STATE_WITH_STATE_LENGTH):
                    past_states.put(np.zeros(Settings.TOTAL_STATE_SIZE), False)
                
            while True:
                # TODO: make better frequency managing
                sleep(g.step)
                total_time = total_time + g.step
                # print('G IDS : ',g.ids) # debug....
                policy_input = np.zeros(Settings.TOTAL_STATE_SIZE) # initializing policy input
                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + g.step
                    
                    
                    """ policy_input is: [chaser_x, chaser_y, chaser_z, chaser_theta, target_x, target_y, target_z, target_theta, 
                                          chaser_x_dot, chaser_y_dot, chaser_z_dot, chaser_theta_dot + (optional past action data)] 
                    
                    Note: chaser_theta_dot can be returned as a zero (it is discarded before being run through the network)
                    """
                    
                                    
                    #print('rc.W',rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id: # we've found the target
                        policy_input[3] =  rc.X[0] # target X [north] =   North
                        policy_input[4] = -rc.X[1] # target Y [west]  = - East
                        policy_input[5] =  rc.X[2] # target Z [up]    =   Up
                        policy_input[6] =  np.unwrap([last_target_yaw, -rc.W[2]])[1] # target yaw  [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        #print("Target position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f" %(rc.X[0], -rc.X[1], rc.X[2], -rc.W[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude
                    if rc.id == follower_id: # we've found the chaser (follower)
                        policy_input[0] =  rc.X[0] # chaser X [north] =   North
                        policy_input[1] = -rc.X[1] # chaser Y [west]  = - East
                        policy_input[2] =  rc.X[2] # chaser Z [up]    =   Up                        
                        
                        policy_input[7] =  rc.V[0] # chaser V_x [north] =   North
                        policy_input[8] = -rc.V[1] # chaser V_y [west]  = - East
                        policy_input[9] =  rc.V[2] # chaser V_z [up]    =   Up
                        
                        #print("Time: %.2f; Chaser position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f; Vx: %.2f; Vy: %.2f; Vz: %.2f" %(rc.timeout, rc.X[0], -rc.X[1], rc.X[2], -rc.W[2], rc.V[0], -rc.V[1], rc.V[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:                        
                    past_action_data = np.asarray(past_actions.queue).reshape([-1]) # past actions reshaped into a column
                    
                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)
                    
                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate([policy_input, past_action_data])

                    
                    
                    
                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (policy_input - Settings.STATE_MEAN)/Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input
        
                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input, Settings.IRRELEVANT_STATES)
        
                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape([-1, Settings.OBSERVATION_SIZE])
        
                # Run processed state through the policy
                deep_guidance = sess.run(actor.action_scaled, feed_dict={state_placeholder:normalized_policy_input})[0]
                # deep guidance = [ chaser_x_acceleration [north], chaser_y_acceleration [west], chaser_z_acceleration [up] ]
                
                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)
                    
                # Limit guidance commands if velocity is too high!
                # Checking whether our velocity is too large AND the acceleration is trying to increase said velocity... in which case we set the desired_linear_acceleration to zero.
                current_velocity = policy_input[7:10]
                deep_guidance[(np.abs(current_velocity) > Settings.VELOCITY_LIMIT) & (np.sign(deep_guidance) == np.sign(current_velocity))] = 0
                
#                deep_guidance[0], dgn = signal.lfilter(b, a, [deep_guidance[0]] , zi=dgn)
#                deep_guidance[1], dge = signal.lfilter(b, a, [deep_guidance[1]] , zi=dge)
#                deep_guidance[2], dgd = signal.lfilter(b, a, [deep_guidance[2]] , zi=dgd)
                
                # deep_guidance[2], deep_guidance_prev_filt[2] = signal.lfilter(b, a, deep_guidance[2], zi=deep_guidance_prev_filt[2])
                # deep_guidance[3], deep_guidance_prev_filt[3] = signal.lfilter(b, a, deep_guidance[3], zi=deep_guidance_prev_filt[3])
                # Send velocity/acceleration command to aircraft!
                #g.move_at_ned_vel( yaw=-deep_guidance[0])
                g.accelerate(north = deep_guidance[0], east = -deep_guidance[1], down = -deep_guidance[2])
                #print("Deep guidance command: a_x: %.2f; a_y: %.2f; a_z: %.2f" %( deep_guidance[1], deep_guidance[2], deep_guidance[3]))
                print("Time: %.2f; X: %.2f; Vx: %.2f; Ax: %.2f" %(total_time, policy_input[0], policy_input[7], deep_guidance[0]))
                print('Deep Guidance :',deep_guidance)

                # print('Policy input shape :',normalized_policy_input.shape)

                # Log all inputs and outputs:
                t = time.time()-start_time
                log_placeholder[i,0] = t
                log_placeholder[i,1:4] = deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i,4:4+len(normalized_policy_input[0])] = normalized_policy_input
                i += 1
    


        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.set_nav_mode()
            g.shutdown()
            sleep(0.2)
            with open(log_filename+".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
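
The velocity limit applied just before the guidance command is sent zeroes any acceleration component that would push an already-over-limit axis even faster. Here is a standalone sketch of that masking logic with made-up numbers; VELOCITY_LIMIT is an illustrative value, and the three-element [a_x, a_y, a_z] layout is assumed from how deep_guidance is indexed in the example above:

import numpy as np

VELOCITY_LIMIT = 1.5  # [m/s], illustrative stand-in for Settings.VELOCITY_LIMIT

deep_guidance = np.array([2.0, -1.0, 0.5])       # candidate accelerations [a_x, a_y, a_z]
current_velocity = np.array([1.8, 0.3, -2.0])    # chaser velocity taken from policy_input[7:10]

# Zero any acceleration component that is (a) on an axis already over the speed limit
# and (b) pointing in the same direction as the current velocity on that axis.
too_fast = np.abs(current_velocity) > VELOCITY_LIMIT
same_direction = np.sign(deep_guidance) == np.sign(current_velocity)
deep_guidance[too_fast & same_direction] = 0

print(deep_guidance)  # [ 0.  -1.   0.5] -> only the x command was suppressed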
Example #6
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti",
                        "--target_id",
                        dest='target_id',
                        default=0,
                        type=int,
                        help="Target aircraft ID")
    parser.add_argument("-fi",
                        "--follower_id",
                        dest='follower_id',
                        default=0,
                        type=int,
                        help="Follower aircraft ID")
    parser.add_argument("-f",
                        "--filename",
                        dest='log_filename',
                        default='log_accel_000',
                        type=str,
                        help="Log file name")
    parser.add_argument("-d",
                        "--deadband",
                        dest='deadband_radius',
                        default=0.0,
                        type=float,
                        help="deadband radius")
    parser.add_argument("-no_avg",
                        "--dont_average_output",
                        dest="dont_average_output",
                        action="store_true")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename
    deadband_radius = args.deadband_radius
    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i = 0  # for log increment

    # Flag to not average the guidance output
    dont_average_output = args.dont_average_output
    if dont_average_output:
        print("\n\nDeep guidance output is NOT averaged\n\n")
    else:
        print("\n\nDeep guidance output is averaged\n\n")

    timestep = Settings.TIMESTEP

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, Settings.OBSERVATION_SIZE],
            name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder,
                                  scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")

        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################

        try:
            start_time = time.time()
            g = Guidance(interface=interface,
                         quad_ids=[target_id, follower_id])
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)
            last_target_yaw = 0.0
            total_time = 0.0

            last_deep_guidance = np.zeros(Settings.ACTION_SIZE)

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(
                    maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)

                # Fill it with zeros to start
                for i in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)

            while True:
                # TODO: make better frequency managing
                sleep(timestep)
                total_time = total_time + timestep
                # print('G IDS : ',g.ids) # debug....
                policy_input = np.zeros(
                    Settings.TOTAL_STATE_SIZE)  # initializing policy input
                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + timestep
                    """ policy_input is: [chaser_x, chaser_y, chaser_z, target_x, target_y, target_z, target_theta, 
                                          chaser_x_dot, chaser_y_dot, chaser_z_dot, (optional past action data)] 
                    """

                    #print('rc.W',rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id:  # we've found the target
                        policy_input[3] = rc.X[0]  # target X [north] =   North
                        policy_input[4] = -rc.X[1]  # target Y [west]  = - East
                        policy_input[5] = rc.X[2]  # target Z [up]    =   Up
                        policy_input[6] = np.unwrap(
                            [last_target_yaw, -rc.W[2]]
                        )[1]  # target yaw  [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        #print("Target position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f" %(rc.X[0], -rc.X[1], rc.X[2], -rc.W[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude
                    if rc.id == follower_id:  # we've found the chaser (follower)
                        policy_input[0] = rc.X[0]  # chaser X [north] =   North
                        policy_input[1] = -rc.X[1]  # chaser Y [west]  = - East
                        policy_input[2] = rc.X[2]  # chaser Z [up]    =   Up

                        policy_input[7] = rc.V[0]   # chaser V_x [north] =   North
                        policy_input[8] = -rc.V[1]  # chaser V_y [west]  = - East
                        policy_input[9] = rc.V[2]   # chaser V_z [up]    =   Up

                        #print("Time: %.2f; Chaser position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f; Vx: %.2f; Vy: %.2f; Vz: %.2f" %(rc.timeout, rc.X[0], -rc.X[1], rc.X[2], -rc.W[2], rc.V[0], -rc.V[1], rc.V[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_action_data = np.asarray(past_actions.queue).reshape(
                        [-1])  # past actions reshaped into a column

                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)

                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate(
                        [policy_input, past_action_data])

                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (
                        policy_input -
                        Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input

                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input,
                                                    Settings.IRRELEVANT_STATES)

                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape(
                    [-1, Settings.OBSERVATION_SIZE])

                # Run processed state through the policy
                deep_guidance = sess.run(
                    actor.action_scaled,
                    feed_dict={state_placeholder: normalized_policy_input})[0]
                # deep guidance = [ chaser_x_acceleration [north], chaser_y_acceleration [west] ]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Limit guidance commands if velocity is too high!
                # Checking whether our velocity is too large AND the acceleration is trying to increase said velocity... in which case we set the desired_linear_acceleration to zero.
                current_velocity = policy_input[7:10]
                deep_guidance[
                    (np.abs(current_velocity[0:2]) > Settings.VELOCITY_LIMIT)
                    & (np.sign(deep_guidance) == np.sign(current_velocity[0:2])
                       )] = 0

                # If we are in the deadband, set the acceleration to zero!
                #                desired_location = np.array([policy_input[3]+3*np.cos(policy_input[6]), policy_input[4]+3*np.sin(policy_input[6]), policy_input[5]])
                #                current_location = policy_input[0:3]
                #                deep_guidance[np.abs((np.abs(current_location) - np.abs(desired_location))) < deadband_radius] = 0

                average_deep_guidance = (last_deep_guidance +
                                         deep_guidance) / 2.0
                last_deep_guidance = deep_guidance

                # Send velocity/acceleration command to aircraft!
                #g.accelerate(north = deep_guidance[0], east = -deep_guidance[1], down = -deep_guidance[2])

                # Since we are accelerating only in 2D, we must send an altitude command.
                desired_altitude = 3  # [m]

                if dont_average_output:
                    g.accelerate(north=deep_guidance[0],
                                 east=-deep_guidance[1],
                                 down=desired_altitude,
                                 quad_id=follower_id)
                else:
                    g.accelerate(north=average_deep_guidance[0],
                                 east=-average_deep_guidance[1],
                                 down=desired_altitude,
                                 quad_id=follower_id)  # Averaged

                #g.accelerate(north = average_deep_guidance[0], east = -average_deep_guidance[1], down = -average_deep_guidance[2], quad_id = follower_id)
                #g.accelerate(north = 1, east = 0.1, down = 0)

                print(
                    "X: %2.2f Y: %2.2f Z: %2.2f Vx: %2.2f Vy: %2.2f Vz: %2.2f Guidance_X: %.2f, Y: %.2f, Z: Alt held at %.2f"
                    % (policy_input[0], policy_input[1], policy_input[2],
                       policy_input[7], policy_input[8], policy_input[9],
                       average_deep_guidance[0], average_deep_guidance[1],
                       desired_altitude))

                # Log all inputs and outputs:
                t = time.time() - start_time
                log_placeholder[i, 0] = t
                log_placeholder[i, 1:3] = deep_guidance
                log_placeholder[i, 3:5] = average_deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i, 5:5 + len(policy_input)] = policy_input
                i += 1

        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.shutdown()
            sleep(0.2)
            with open(log_filename + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
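
Examples 5 and 6 (and Example 7 below) optionally augment the observation with a sliding window of recent actions when Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0. The following is a minimal sketch of that bookkeeping, with small illustrative sizes standing in for the Settings constants:

import queue
import numpy as np

ACTION_SIZE = 2                       # illustrative stand-in for Settings.ACTION_SIZE
AUGMENT_STATE_WITH_ACTION_LENGTH = 3  # illustrative stand-in for the Settings constant

# Pre-fill the queue with zero-actions so the augmented state has a fixed length from step one
past_actions = queue.Queue(maxsize=AUGMENT_STATE_WITH_ACTION_LENGTH)
for _ in range(AUGMENT_STATE_WITH_ACTION_LENGTH):
    past_actions.put(np.zeros(ACTION_SIZE), False)

raw_state = np.array([1.0, 2.0, 3.0])  # hypothetical un-augmented policy input

# Flatten the queued actions into one vector and append them to the state
past_action_data = np.asarray(past_actions.queue).reshape([-1])
past_actions.get(False)                # drop the oldest action
policy_input = np.concatenate([raw_state, past_action_data])

# After the policy runs, push the newest action so the window slides forward
newest_action = np.array([0.1, -0.2])
past_actions.put(newest_action, False)

print(policy_input.shape)              # (3 + 2*3,) = (9,)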
Example #7
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti",
                        "--target_id",
                        dest='target_id',
                        default=0,
                        type=int,
                        help="Target aircraft ID")
    parser.add_argument("-fi",
                        "--follower_id",
                        dest='follower_id',
                        default=0,
                        type=int,
                        help="Follower aircraft ID")
    parser.add_argument("-f",
                        "--filename",
                        dest='log_filename',
                        default='log_velocity_000',
                        type=str,
                        help="Log file name")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename
    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i = 0  # for log increment

    timestep = Settings.TIMESTEP

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, Settings.OBSERVATION_SIZE],
            name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder,
                                  scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")

        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################

        ### End deep guidance initialization stuff

        try:
            start_time = time.time()
            g = Guidance(interface=interface,
                         quad_ids=[target_id, follower_id])
            sleep(0.1)
            g.set_guided_mode(quad_id=follower_id)
            sleep(0.2)
            last_target_yaw = 0.0

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(
                    maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)

                # Fill it with zeros to start
                for i in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)

            while True:
                # TODO: make better frequency managing
                sleep(timestep)
                # print('G IDS : ',g.ids) # debug....
                policy_input = np.zeros(
                    Settings.TOTAL_STATE_SIZE)  # initializing policy input
                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + timestep
                    # print('rc.id',rc.id)
                    #print('rc.W',rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id:  # we've found the target
                        policy_input[3] = rc.X[0]  # target X [north] =   North
                        policy_input[4] = -rc.X[1]  # target Y [west]  = - East
                        policy_input[5] = rc.X[2]  # target Z [up]    =   Up
                        policy_input[6] = np.unwrap(
                            [last_target_yaw, -rc.W[2]]
                        )[1]  # target yaw  [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude
                    if rc.id == follower_id:  # we've found the chaser (follower)
                        policy_input[0] = rc.X[0]  # chaser X [north] =   North
                        policy_input[1] = -rc.X[1]  # chaser Y [west]  = - East
                        policy_input[2] = rc.X[2]  # chaser Z [up]    =   Up

                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_action_data = np.asarray(past_actions.queue).reshape(
                        [-1])  # past actions reshaped into a column

                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)

                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate(
                        [policy_input, past_action_data])

                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (
                        policy_input -
                        Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input

                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input,
                                                    Settings.IRRELEVANT_STATES)

                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape(
                    [-1, Settings.OBSERVATION_SIZE])

                # Run processed state through the policy
                deep_guidance = sess.run(
                    actor.action_scaled,
                    feed_dict={state_placeholder: normalized_policy_input})[0]
                # deep guidance = [chaser_x_velocity [north], chaser_y_velocity [west], chaser_z_velocity [up], chaser_angular_velocity [counter-clockwise looking down from above]]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Send velocity command to aircraft!
                g.move_at_ned_vel(north=deep_guidance[0],
                                  east=-deep_guidance[1],
                                  down=0,
                                  quad_id=follower_id)
                print("Policy input: ", policy_input,
                      "Deep guidance command: ", deep_guidance)

                # Log all inputs and outputs:
                t = time.time() - start_time
                log_placeholder[i, 0] = t
                log_placeholder[i, 1:3] = deep_guidance
                log_placeholder[i, 3:5] = deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i, 5:5 + len(policy_input)] = policy_input
                i += 1

        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.set_nav_mode(quad_id=follower_id)
            g.shutdown()
            sleep(0.2)
            with open(log_filename + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
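
The target-yaw handling in Examples 5-7 threads the previous yaw through np.unwrap so the heading stays continuous when the raw attitude wraps past ±π. Here is a short sketch of why that matters, using made-up readings:

import numpy as np

last_target_yaw = 3.10   # just below +pi
raw_yaw_reading = -3.12  # the sensor wrapped around to just above -pi

# Unwrapping against the previous value keeps the yaw sequence continuous
# instead of jumping by roughly 2*pi between consecutive timesteps.
unwrapped = np.unwrap([last_target_yaw, raw_yaw_reading])[1]

print(raw_yaw_reading)   # -3.12
print(unwrapped)         # ~3.163, the same heading expressed continuously
last_target_yaw = unwrapped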
Example #8
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument('-ids',
                        '--quad_ids',
                        nargs='+',
                        help='<Required> IDs of all quads used',
                        required=True)
    parser.add_argument("-f",
                        "--filename",
                        dest='log_filename',
                        default='log_runway_000',
                        type=str,
                        help="Log file name")
    parser.add_argument("-L",
                        "--new_length",
                        dest='new_length',
                        default=Settings.RUNWAY_LENGTH,
                        type=float,
                        help="Override the 124 m runway length")
    parser.add_argument("-W",
                        "--new_width",
                        dest='new_width',
                        default=Settings.RUNWAY_WIDTH,
                        type=float,
                        help="Override the 12.5 m runway width")
    parser.add_argument("-alt",
                        "--altitude",
                        dest='altitude',
                        default=2,
                        type=float,
                        help="Override the 2 m first quad altitude")
    parser.add_argument("-no_avg",
                        "--dont_average_output",
                        dest="dont_average_output",
                        action="store_true")
    parser.add_argument("-fail",
                        "--failure_times",
                        nargs='+',
                        dest="failure_times",
                        default=[9999.9])
    args = parser.parse_args()

    failure_times = list(map(float, args.failure_times))

    if Settings.INDOORS:
        runway_angle_wrt_north = 0
    else:
        runway_angle_wrt_north = (156.485 - 90) * np.pi / 180  #[rad]
    C_bI = make_C_bI(
        runway_angle_wrt_north
    )  # [3x3] rotation matrix from North (I) to body (tilted runway)
    C_bI_22 = make_C_bI_22(
        runway_angle_wrt_north
    )  # [2x2] rotation matrix from North (I) to body (tilted runway)

    if args.new_length != Settings.RUNWAY_LENGTH:
        print(
            "\nOverwriting %.1f m runway length with user-defined %.1f m runway length."
            % (Settings.RUNWAY_LENGTH, args.new_length))
    runway_length = args.new_length
    if args.new_width != Settings.RUNWAY_WIDTH:
        print(
            "\nOverwriting %.1f m runway width with user-defined %.1f m runway width."
            % (Settings.RUNWAY_WIDTH, args.new_width))
    runway_width = args.new_width

    first_quad_altitude = args.altitude

    # Communication delay length in timesteps
    COMMUNICATION_DELAY_LENGTH = 0  # timesteps
    if COMMUNICATION_DELAY_LENGTH > 0:
        print("\nA simulated communication delay of %.1f seconds is used" %
              (COMMUNICATION_DELAY_LENGTH * Settings.TIMESTEP))

    interface = None
    not_done = True

    failure_acknowledged = np.tile(False, len(failure_times))

    average_deep_guidance_NED = np.zeros(
        [Settings.NUMBER_OF_QUADS, Settings.ACTION_SIZE])

    # converting this input from a list of strings to a list of ints
    all_ids = list(map(int, args.quad_ids))

    log_filename = args.log_filename
    max_duration = 100000
    log_placeholder = np.zeros(
        (max_duration,
         3 * Settings.NUMBER_OF_QUADS + 1 + Settings.TOTAL_STATE_SIZE + 6))
    log_counter = 0  # for log increment

    # Flag to not average the guidance output
    dont_average_output = args.dont_average_output
    if dont_average_output:
        print("\n\nDeep guidance output is NOT averaged\n\n")
    else:
        print("\n\nDeep guidance output is averaged\n\n")

    timestep = Settings.TIMESTEP

    # Generate Polygons for runway tiles
    # The size of each runway grid element
    each_runway_length_element = Settings.RUNWAY_LENGTH / Settings.RUNWAY_LENGTH_ELEMENTS
    each_runway_width_element = Settings.RUNWAY_WIDTH / Settings.RUNWAY_WIDTH_ELEMENTS
    tile_polygons = []
    for i in range(Settings.RUNWAY_LENGTH_ELEMENTS):
        this_row = []
        for j in range(Settings.RUNWAY_WIDTH_ELEMENTS):
            # make the polygon
            this_row.append(
                Polygon([[
                    each_runway_length_element * i -
                    Settings.RUNWAY_LENGTH / 2,
                    each_runway_width_element * j - Settings.RUNWAY_WIDTH / 2
                ],
                         [
                             each_runway_length_element * (i + 1) -
                             Settings.RUNWAY_LENGTH / 2,
                             each_runway_width_element * j -
                             Settings.RUNWAY_WIDTH / 2
                         ],
                         [
                             each_runway_length_element * (i + 1) -
                             Settings.RUNWAY_LENGTH / 2,
                             each_runway_width_element * (j + 1) -
                             Settings.RUNWAY_WIDTH / 2
                         ],
                         [
                             each_runway_length_element * i -
                             Settings.RUNWAY_LENGTH / 2,
                             each_runway_width_element * (j + 1) -
                             Settings.RUNWAY_WIDTH / 2
                         ]]))

        tile_polygons.append(this_row)
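    # Each polygon above is one axis-aligned runway grid cell of size each_runway_length_element x each_runway_width_element;
    # together the cells tile [-RUNWAY_LENGTH/2, RUNWAY_LENGTH/2] x [-RUNWAY_WIDTH/2, RUNWAY_WIDTH/2]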

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, Settings.OBSERVATION_SIZE],
            name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder,
                                  scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")

        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################

        try:
            start_time = time.time()
            g = Guidance(interface=interface, quad_ids=all_ids)
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)

            total_time = 0.0

            last_deep_guidance = np.zeros(Settings.ACTION_SIZE)

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(
                    maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)

                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(
                        np.zeros(
                            [Settings.NUMBER_OF_QUADS, Settings.ACTION_SIZE]),
                        False)

            # Initializing
            runway_state = np.zeros([
                Settings.RUNWAY_LENGTH_ELEMENTS, Settings.RUNWAY_WIDTH_ELEMENTS
            ])
            # A list of all the tiles that rewards should not be returned for
            if Settings.RUNWAY_SHAPE == 'R':
                blank_tiles = []
                print("Rectangular-shaped runway is used!")

            elif Settings.RUNWAY_SHAPE == 'L':
                blank_tiles = list(
                    ([2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [7, 0], [2, 1],
                     [3, 1], [4, 1], [5, 1], [6, 1], [7, 1]))
                print("L-shaped runway is used!")

            elif Settings.RUNWAY_SHAPE == 'C':
                blank_tiles = list(
                    ([2, 0], [3, 0], [4, 0], [5, 0],
                     [2, 1], [3, 1], [4, 1], [5, 1]))
                print("C-shaped runway is used!")

            # Assign tiles we don't care about to 1 (meaning they won't give any reward)
            for i in range(len(blank_tiles)):
                runway_state[blank_tiles[i][0], blank_tiles[i][1]] = 1

            last_runway_state = np.copy(runway_state)
            desired_altitudes = np.linspace(first_quad_altitude +
                                            Settings.NUMBER_OF_QUADS * 2,
                                            first_quad_altitude,
                                            Settings.NUMBER_OF_QUADS,
                                            endpoint=False)
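            # With endpoint=False, the quads are spaced 2 m apart in altitude: quad 0 flies highest and the last quad flies 2 m above first_quad_altitude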

            while not_done:
                # TODO: make better frequency managing
                sleep(timestep)

                # Initializing quadrotor positions and velocities
                quad_positions = np.zeros([Settings.NUMBER_OF_QUADS, 3])
                quad_velocities = np.zeros([Settings.NUMBER_OF_QUADS, 3])

                quad_number_not_id = 0
                for rc in g.rotorcrafts:

                    rc.timeout = rc.timeout + timestep
                    """ policy_input is: [chaser_x, chaser_y, chaser_z, target_x, target_y, target_z, target_theta, 
                                          chaser_x_dot, chaser_y_dot, chaser_z_dot, (optional past action data)] 
                    """

                    # Extracting position
                    quad_positions[quad_number_not_id, 0] = rc.X[0]
                    quad_positions[quad_number_not_id, 1] = -rc.X[1]
                    quad_positions[quad_number_not_id, 2] = rc.X[2]

                    # Rotating from Inertial frame (NED frame) into body frame (runway frame)
                    quad_positions[quad_number_not_id, :] = np.matmul(
                        C_bI, quad_positions[quad_number_not_id, :])

                    # Scale position after rotating
                    quad_positions[quad_number_not_id, 0] *= Settings.RUNWAY_LENGTH / runway_length  # scaling to the new runway length
                    quad_positions[quad_number_not_id, 1] *= Settings.RUNWAY_WIDTH / runway_width  # scaling to the new runway width

                    # Extracting velocity
                    quad_velocities[quad_number_not_id, 0] = rc.V[0]   #*Settings.RUNWAY_LENGTH/runway_length
                    quad_velocities[quad_number_not_id, 1] = -rc.V[1]  #*Settings.RUNWAY_WIDTH/runway_width
                    quad_velocities[quad_number_not_id, 2] = rc.V[2]

                    # Rotating from Inertial frame (NED frame) into body frame (runway frame)
                    quad_velocities[quad_number_not_id, :] = np.matmul(
                        C_bI, quad_velocities[quad_number_not_id, :])

                    # Scale velocities after rotating
                    quad_velocities[quad_number_not_id, 0] *= Settings.RUNWAY_LENGTH / runway_length
                    quad_velocities[quad_number_not_id, 1] *= Settings.RUNWAY_WIDTH / runway_width

                    quad_number_not_id += 1

                # Resetting the action delay queue
                if total_time == 0.0:
                    if COMMUNICATION_DELAY_LENGTH > 0:
                        communication_delay_queue = queue.Queue(
                            maxsize=COMMUNICATION_DELAY_LENGTH + 1)
                        # Fill it with zeros initially
                        for i in range(COMMUNICATION_DELAY_LENGTH):
                            communication_delay_queue.put(
                                [quad_positions, quad_velocities], False)

                    # Resetting the initial previous position to be the first position
                    previous_quad_positions = quad_positions

                # Checking if a quadrotor has failed
                for i in range(len(failure_times)):
                    if (total_time > failure_times[i]) and (
                            not failure_acknowledged[i]):
                        if total_time - failure_times[i] <= timestep:  # announce only on the first timestep after the failure
                            print("\n\nQuad %i has failed!\n\n" % i)
                        # Force the position and velocity to their 'failed' states
                        quad_positions[i, :] = Settings.LOWER_STATE_BOUND[:3]
                        previous_quad_positions[i, :] = quad_positions[i, :]
                        quad_velocities[i, :] = np.zeros([3])

                if COMMUNICATION_DELAY_LENGTH > 0:
                    communication_delay_queue.put(
                        [quad_positions, quad_velocities], False
                    )  # puts the current position and velocity to the bottom of the stack
                    delayed_quad_positions, delayed_quad_velocities = communication_delay_queue.get(
                        False)  # grabs the delayed position and velocity.
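                    # After the zero-filled warm-up entries, the pair popped here was recorded COMMUNICATION_DELAY_LENGTH timesteps earlier, simulating stale measurements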

                ##############################################################
                ### Check the runway for new tiles that have been explored ###
                ##############################################################
                # Generate quadrotor LineStrings
                for i in range(Settings.NUMBER_OF_QUADS):
                    quad_line = LineString([
                        quad_positions[i, :-1], previous_quad_positions[i, :-1]
                    ])

                    for j in range(Settings.RUNWAY_LENGTH_ELEMENTS):
                        for k in range(Settings.RUNWAY_WIDTH_ELEMENTS):
                            # If this element has already been explored, skip it
                            if runway_state[j, k] == 0 and quad_line.intersects(tile_polygons[j][k]):
                                runway_state[j, k] = 1
                                #print("Quad %i traced the line %s and explored runway element length = %i, width = %i with coordinates %s" %(i,list(quad_line.coords),j,k,tile_polygons[j][k].bounds))

                # Storing current quad positions for the next timestep
                previous_quad_positions = quad_positions

                # Print if a new tile has been explored
                if np.any(last_runway_state != runway_state):
                    print(
                        "Runway elements discovered %i/%i" %
                        (np.sum(runway_state), Settings.RUNWAY_LENGTH_ELEMENTS
                         * Settings.RUNWAY_WIDTH_ELEMENTS))

                    # Draw a new runway
                    print(np.flip(runway_state))

                if np.all(runway_state == 1):
                    print(
                        "Explored the entire runway in %.2f seconds -- congratulations! Quitting deep guidance"
                        % (time.time() - start_time))
                    not_done = False

                total_states = []
                # Building NUMBER_OF_QUADS states
                for j in range(Settings.NUMBER_OF_QUADS):
                    # Start state with your own
                    this_quads_state = np.concatenate(
                        [quad_positions[j, :], quad_velocities[j, :]])
                    # Add in the others' states, starting with the next quad and finishing with the previous quad
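                    # e.g. with 3 quads and j = 1, k runs over 2, 3 so k % 3 selects quad 2 and then quad 0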
                    for k in range(j + 1, Settings.NUMBER_OF_QUADS + j):
                        if COMMUNICATION_DELAY_LENGTH > 0:
                            this_quads_state = np.concatenate([
                                this_quads_state, delayed_quad_positions[
                                    k % Settings.NUMBER_OF_QUADS, :],
                                delayed_quad_velocities[
                                    k % Settings.NUMBER_OF_QUADS, :]
                            ])
                        else:
                            this_quads_state = np.concatenate([
                                this_quads_state,
                                quad_positions[k %
                                               Settings.NUMBER_OF_QUADS, :],
                                quad_velocities[k %
                                                Settings.NUMBER_OF_QUADS, :]
                            ])

                    # All quad data is included, now append the runway state and save it to the total_state
                    total_states.append(this_quads_state)

                # Augment total_state with past actions, if appropriate
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    # At this point total_states holds one row per quad (its own and its neighbours' positions and velocities).
                    # We just received this total_state from the environment and need to augment it with the past action data.
                    # past_actions.queue holds AUGMENT_STATE_WITH_ACTION_LENGTH arrays of shape [NUMBER_OF_QUADS, ACTION_SIZE],
                    # so the stacked queue has shape [AUGMENT_STATE_WITH_ACTION_LENGTH, NUMBER_OF_QUADS, ACTION_SIZE];
                    # the first two axes are swapped so it can be reshaped into one row of past actions per quad.

                    past_action_data = np.swapaxes(
                        np.asarray(past_actions.queue), 0, 1).reshape([
                            Settings.NUMBER_OF_QUADS, -1
                        ])  # past actions reshaped into rows for each quad
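                    # e.g. with queue length 3, 2 quads, and 2 actions: stacked queue (3, 2, 2) -> swapaxes (2, 3, 2) -> reshape (2, 6)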
                    total_states = np.concatenate(
                        [np.asarray(total_states), past_action_data], axis=1)

                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)

                # Concatenating the runway to the augmented state
                total_states = np.concatenate([
                    total_states,
                    np.tile(runway_state.reshape(-1),
                            (Settings.NUMBER_OF_QUADS, 1))
                ],
                                              axis=1)

                total_state_to_log = total_states[0, :]

                # Normalize the state
                if Settings.NORMALIZE_STATE:
                    total_states = (total_states - Settings.STATE_MEAN
                                    ) / Settings.STATE_HALF_RANGE

                # Discarding irrelevant states
                observations = np.delete(total_states,
                                         Settings.IRRELEVANT_STATES,
                                         axis=1)

                # Run processed state through the policy
                deep_guidance = sess.run(
                    actor.action_scaled,
                    feed_dict={state_placeholder: observations}
                )  # deep guidance = [ chaser_x_acceleration [runway north], chaser_y_acceleration [runway west], chaser_z_acceleration [up] ]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Limit guidance commands if velocity is too high!
                # Checking whether our velocity is too large AND the acceleration is trying to increase said velocity... in which case we set the desired_linear_acceleration to zero.
                for j in range(Settings.NUMBER_OF_QUADS):
                    #print(quad_velocities[j,0:2], (np.abs(quad_velocities[j,0:2]) > Settings.VELOCITY_LIMIT) & (np.sign(deep_guidance[j,:]) == np.sign(quad_velocities[j,0:2])), "Unsaturated: ", deep_guidance, end=' ')
                    # Components whose speed already exceeds the limit while the command would push them further in that direction
                    overspeed = (np.abs(quad_velocities[j, 0:2]) > Settings.VELOCITY_LIMIT) & \
                                (np.sign(deep_guidance[j, :]) == np.sign(quad_velocities[j, 0:2]))
                    # Replace those components with a fixed acceleration opposing the motion
                    deep_guidance[j, overspeed] = -2 * np.sign(deep_guidance[j, overspeed])
                    #print("Saturated: ", deep_guidance, end=' ')

                average_deep_guidance = (last_deep_guidance +
                                         deep_guidance) / 2.0
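                # (a two-sample moving average that smooths consecutive guidance commands)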
                last_deep_guidance = deep_guidance
                last_runway_state = np.copy(runway_state)

                # Get each quad to accelerate appropriately
                for j in range(Settings.NUMBER_OF_QUADS):

                    # Checking if a quadrotor has failed
                    for i in range(len(failure_times)):
                        if (total_time > failure_times[i]) and (i == j):
                            if np.abs(total_time - failure_times[i]) < 0.5:
                                print("\n\nSimulated failure of quad %i\n\n" %
                                      (all_ids[i]))
                            skip_this_one = True
                            break
                        else:
                            skip_this_one = False
                    if skip_this_one:
                        continue

                    # Each quad is assigned a different altitude to remain at.
                    desired_altitude = desired_altitudes[j]

                    # Rotate desired deep guidance command from body frame (runway frame) frame to inertial frame (NED frame)
                    #print("Unrotated average: ", average_deep_guidance)
                    average_deep_guidance_NED[j, :] = np.matmul(
                        C_bI_22.T, average_deep_guidance[j, :])
                    #print("Rotated average: ", average_deep_guidance)

                    if dont_average_output:
                        g.accelerate(north=deep_guidance[j, 0],
                                     east=-deep_guidance[j, 1],
                                     down=desired_altitude,
                                     quad_id=g.ids[j])
                        print("The no_avg setting is broken")
                        raise SystemExit
                    else:
                        g.accelerate(north=average_deep_guidance_NED[j, 0],
                                     east=-average_deep_guidance_NED[j, 1],
                                     down=desired_altitude,
                                     quad_id=g.ids[j])  # Averaged

                total_time = total_time + timestep
                # Log all input and outputs:
                t = time.time() - start_time
                log_placeholder[log_counter, 0] = t
                log_placeholder[log_counter, 1:3 * Settings.NUMBER_OF_QUADS +
                                1] = np.concatenate([
                                    average_deep_guidance.reshape(-1),
                                    desired_altitudes.reshape(-1)
                                ])
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[log_counter, 3 * Settings.NUMBER_OF_QUADS +
                                1:3 * Settings.NUMBER_OF_QUADS + 1 +
                                Settings.TOTAL_STATE_SIZE +
                                6] = total_state_to_log
                log_counter += 1

            # If we ended gracefully: exit() raises SystemExit, which is caught below so the
            # quads are shut down and the log is saved whether we end gracefully or forcefully
            exit()

        # If we ended forcefully
        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.shutdown()
            sleep(0.2)
            print("Saving file as %s..." %
                  (log_filename + "_L" + str(runway_length) + "_W" +
                   str(runway_width) + ".txt"))
            with open(
                    log_filename + "_L" + str(runway_length) + "_W" +
                    str(runway_width) + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:log_counter])
            print("Done!")
            exit()
Ejemplo n.º 9
0
    def __init__(self, testing, client_socket, messages_to_deep_guidance,
                 stop_run_flag):

        print("Initializing deep guidance model runner")
        self.client_socket = client_socket
        self.messages_to_deep_guidance = messages_to_deep_guidance
        self.stop_run_flag = stop_run_flag
        self.testing = testing

        ###############################
        ### User-defined parameters ###
        ###############################
        self.offset_x = 0  # Docking offset in the body frame
        self.offset_y = 0  # Docking offset in the body frame
        self.offset_angle = 0

        # So we can access old Pi positions while we wait for new SPOTNet images to come in
        self.pi_position_queue = deque(maxlen=round(CAMERA_PROCESSING_TIME /
                                                    PHASESPACE_TIMESTEP))

        # Loading the queue up with zeros
        for i in range(round(CAMERA_PROCESSING_TIME / PHASESPACE_TIMESTEP)):
            self.pi_position_queue.append((0., 0., 0.))
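        # The oldest entry in this deque is therefore the chaser pose from roughly CAMERA_PROCESSING_TIME seconds ago, i.e. from when the current SPOTNet image was captured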

        # Holding the previous position so we know when SPOTNet gives a new update
        self.previousSPOTNet_relative_x = 0.0
        """ Holding the chaser's x, y, and theta position when a SPOTNet image was taken
            which we assume is at the same moment the previous results are received.
            This is to ensure the ~0.7 s SPOTNet processing time isn't poorly reflected
            in the target's estimated inertial position
        """
        self.chaser_x_when_image_was_taken = 0
        self.chaser_y_when_image_was_taken = 0
        self.chaser_theta_when_image_was_taken = 0

        self.SPOTNet_target_x_inertial = 0
        self.SPOTNet_target_y_inertial = 0
        self.SPOTNet_target_angle_inertial = 0

        # Uncomment this on TF2.0
        # tf.compat.v1.disable_eager_execution()

        # Clear any old graph
        tf.reset_default_graph()

        # Initialize Tensorflow, and load in policy
        self.sess = tf.Session()
        # Building the policy network
        self.state_placeholder = tf.placeholder(
            dtype=tf.float32,
            shape=[None, Settings.OBSERVATION_SIZE],
            name="state_placeholder")
        self.actor = BuildActorNetwork(self.state_placeholder,
                                       scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(self.sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")

        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        print("Done initializing model!")