Example 1
import copy
import datetime
import errno
import json
import math
import os
import sys
import time

import numpy as np
import requests
import msgpackrpc
import airsim

# RlModel wraps the network being trained; it is a project-local module
# (the module path below is assumed).
from rl_model import RlModel


class DistributedAgent():
    def __init__(self, parameters):
        required_parameters = [
            'data_dir', 'max_epoch_runtime_sec', 'replay_memory_size',
            'batch_size', 'min_epsilon', 'per_iter_epsilon_reduction',
            'experiment_name', 'train_conv_layers'
        ]
        for required_parameter in required_parameters:
            if required_parameter not in parameters:
                raise ValueError('Missing required parameter {0}'.format(
                    required_parameter))

        parameters['role_type'] = 'agent'

        print('Starting time: {0}'.format(datetime.datetime.utcnow()),
              file=sys.stderr)
        self.__model_buffer = None
        self.__model = None
        self.__airsim_started = False
        self.__data_dir = parameters['data_dir']
        self.__per_iter_epsilon_reduction = float(
            parameters['per_iter_epsilon_reduction'])
        self.__min_epsilon = float(parameters['min_epsilon'])
        self.__max_epoch_runtime_sec = float(
            parameters['max_epoch_runtime_sec'])
        self.__replay_memory_size = int(parameters['replay_memory_size'])
        self.__batch_size = int(parameters['batch_size'])
        self.__experiment_name = parameters['experiment_name']
        self.__train_conv_layers = bool(
            (parameters['train_conv_layers'].lower().strip() == 'true'))
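        # Epsilon-greedy schedule: start fully random (epsilon = 1) and anneal
        # by per_iter_epsilon_reduction after each epoch down to min_epsilon.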
        self.__epsilon = 1
        self.__num_batches_run = 0
        self.__last_checkpoint_batch_count = 0

        if 'batch_update_frequency' in parameters:
            self.__batch_update_frequency = int(
                parameters['batch_update_frequency'])

        if 'weights_path' in parameters:
            self.__weights_path = parameters['weights_path']
        else:
            self.__weights_path = None

        if 'airsim_path' in parameters:
            self.__airsim_path = parameters['airsim_path']
        else:
            self.__airsim_path = None

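        # The mere presence of the 'local_run' key (any value) switches the
        # agent to standalone mode with no trainer node.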
        self.__local_run = 'local_run' in parameters

        self.__car_client = airsim.CarClient()
        self.__car_controls = airsim.CarControls()

        self.__minibatch_dir = os.path.join(self.__data_dir, 'minibatches')
        self.__output_model_dir = os.path.join(self.__data_dir, 'models')

        self.__make_dir_if_not_exist(self.__minibatch_dir)
        self.__make_dir_if_not_exist(self.__output_model_dir)
        self.__last_model_file = ''

        self.__possible_ip_addresses = []
        self.__trainer_ip_address = None

        self.__experiences = {}

        self.__init_road_points()
        self.__init_reward_points()

    # Starts the agent
    def start(self):
        self.__run_function()

    # The function that will be run during training.
    # It will initialize the connection to the trainer, start AirSim, and continuously run training iterations.
    def __run_function(self):
        print('Starting run function')

        # Once the trainer is online, it will write its IP to a file at (data_dir)\trainer_ip\(experiment_name)\trainer_ip.txt
        # Wait for that file to exist
        if not self.__local_run:
            while True:
                trainer_ip_dir = os.path.join(
                    os.path.join(self.__data_dir, 'trainer_ip'),
                    self.__experiment_name)
                print('Checking {0}...'.format(trainer_ip_dir))
                if os.path.isdir(trainer_ip_dir):
                    with open(os.path.join(trainer_ip_dir, 'trainer_ip.txt'),
                              'r') as f:
                        self.__possible_ip_addresses.append(f.read().replace(
                            '\n', ''))
                        break
                time.sleep(5)

            # We now have the IP address for the trainer. Attempt to ping the trainer.
            ping_idx = -1
            while True:
                ping_idx += 1
                try:
                    print('\tPinging {0}...'.format(
                        self.__possible_ip_addresses[ping_idx % len(
                            self.__possible_ip_addresses)]))
                    response = requests.get('http://{0}:80/ping'.format(
                        self.__possible_ip_addresses[ping_idx % len(
                            self.__possible_ip_addresses)])).json()
                    if response['message'] != 'pong':
                        raise ValueError(
                            'Received unexpected message: {0}'.format(
                                response))
                    print('Success!')
                    self.__trainer_ip_address = self.__possible_ip_addresses[
                        ping_idx % len(self.__possible_ip_addresses)]
                    break
                except Exception as e:
                    print('Could not get response. Message is {0}'.format(e))
                    if (ping_idx % len(self.__possible_ip_addresses) == 0):
                        print('Waiting 5 seconds and trying again...')
                        time.sleep(5)

            print('Getting model from the trainer')
            sys.stdout.flush()
            self.__model = RlModel(self.__weights_path,
                                   self.__train_conv_layers)
            self.__get_latest_model()

        else:
            print('Run is local. Skipping connection to trainer.')
            self.__model = RlModel(self.__weights_path,
                                   self.__train_conv_layers)

        self.__connect_to_airsim()

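        # Phase 1: fill the replay memory by acting completely at random
        # (always_random=True) until it holds replay_memory_size samples.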
        while True:
            try:
                self.__run_airsim_epoch(True)
                percent_full = 100.0 * len(
                    self.__experiences['actions']) / self.__replay_memory_size

                if (percent_full >= 100.0):
                    break
            except msgpackrpc.error.TimeoutError:
                self.__connect_to_airsim()

        if not self.__local_run:
            self.__get_latest_model()
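        # Phase 2: alternate between generating experiences with the current
        # policy and publishing gradient updates built from sampled minibatches.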
        while True:
            try:
                if (self.__model is not None):
                    experiences, frame_count = self.__run_airsim_epoch(False)
                    # If we didn't immediately crash, train on the gathered experiences
                    if (frame_count > 0):
                        sampled_experiences = self.__sample_experiences(
                            experiences, frame_count, True)
                        self.__num_batches_run += frame_count
                        # If we successfully sampled, train on the collected minibatches and send the gradients to the trainer node
                        if (len(sampled_experiences) > 0):
                            print('Publishing AirSim Epoch.')
                            self.__publish_batch_and_update_model(
                                sampled_experiences, frame_count)

            except msgpackrpc.error.TimeoutError:
                print('Lost connection to AirSim. Attempting to reconnect.')
                self.__connect_to_airsim()

    # Connects to the AirSim Exe.
    def __connect_to_airsim(self):
        attempt_count = 0
        while True:
            try:
                print('Attempting to connect to AirSim (attempt {0})'.format(
                    attempt_count))
                self.__car_client = airsim.CarClient()
                self.__car_client.confirmConnection()
                self.__car_client.enableApiControl(True)
                self.__car_controls = airsim.CarControls()
                return
            except:
                print('Failed to connect.')
                attempt_count += 1
                if (attempt_count % 10 == 0):
                    print(
                        '10 consecutive failures to connect. Attempting to start AirSim on my own.'
                    )

                    if self.__local_run:
                        os.system('START "" powershell.exe {0}'.format(
                            os.path.join(
                                self.__airsim_path,
                                'AD_Cookbook_Start_AirSim.ps1 neighborhood -windowed'
                            )))
                    else:
                        os.system(
                            'START "" powershell.exe D:\\AD_Cookbook_AirSim\\Scripts\\DistributedRL\\restart_airsim_if_agent.ps1'
                        )
                time.sleep(10)

    # Appends a sample to a ring buffer.
    # If the appended example takes the size of the buffer over buffer_size, the example at the front will be removed.
    def __append_to_ring_buffer(self, item, buffer, buffer_size):
        if (len(buffer) >= buffer_size):
            buffer = buffer[1:]
        buffer.append(item)
        return buffer

    # Runs an iteration of data generation from AirSim.
    # Data will be saved in the replay memory.
    def __run_airsim_epoch(self, always_random):
        print('Running AirSim epoch.')

        # Pick a random starting point on the roads
        starting_points, starting_direction = self.__get_next_starting_point()

        # Initialize the state buffer.
        # For now, save 4 images at 0.01 second intervals.
        state_buffer_len = 4
        state_buffer = []
        wait_delta_sec = 0.01

        self.__car_controls.steering = 0
        self.__car_controls.throttle = 0
        self.__car_controls.brake = 1
        self.__car_client.setCarControls(self.__car_controls)
        time.sleep(2)

        # While the car is rolling, start initializing the state buffer
        stop_run_time = datetime.datetime.now() + datetime.timedelta(seconds=2)
        while (datetime.datetime.now() < stop_run_time):
            time.sleep(wait_delta_sec)
            state_buffer = self.__append_to_ring_buffer(
                self.__get_image(), state_buffer, state_buffer_len)
        done = False
        actions = []  # records the state (action index) chosen at each step
        pre_states = []
        post_states = []
        rewards = []
        predicted_rewards = []
        car_state = self.__car_client.getCarState()

        start_time = datetime.datetime.utcnow()
        end_time = start_time + datetime.timedelta(
            seconds=self.__max_epoch_runtime_sec)

        num_random = 0
        far_off = False

        # Main data collection loop
        while not done:
            collision_info = self.__car_client.simGetCollisionInfo()
            utc_now = datetime.datetime.utcnow()

            # Check for terminal conditions:
            # 1) Car has collided
            # 2) Car is stopped
            # 3) The run has been running for longer than max_epoch_runtime_sec.
            # 4) The car has run off the road
            if (collision_info.has_collided or car_state.speed < 2
                    or utc_now > end_time or far_off):
                done = True
                self.__car_client.reset()
                sys.stderr.flush()
            else:

                # The Agent should occasionally pick random action instead of best action
                do_greedy = np.random.random_sample()
                pre_state = copy.deepcopy(state_buffer)
                if (do_greedy < self.__epsilon or always_random):
                    num_random += 1
                    next_state = self.__model.get_random_state()
                    predicted_reward = 0

                else:
                    next_state, predicted_reward = self.__model.predict_state(
                        pre_state)
                    print('Model predicts {0}'.format(next_state))

                # Convert the selected state to a control signal
                next_control_signals = self.__model.state_to_control_signals(
                    next_state, self.__car_client.getCarState())

                # Take the action
                self.__car_controls.steering = next_control_signals[0]
                self.__car_controls.throttle = next_control_signals[1]
                self.__car_controls.brake = next_control_signals[2]
                self.__car_client.setCarControls(self.__car_controls)

                # Wait for a short period of time to see outcome
                time.sleep(wait_delta_sec)

                # Observe outcome and compute reward from action
                state_buffer = self.__append_to_ring_buffer(
                    self.__get_image(), state_buffer, state_buffer_len)
                car_state = self.__car_client.getCarState()
                collision_info = self.__car_client.simGetCollisionInfo()
                reward, far_off = self.__compute_reward(
                    collision_info, car_state)

                # Add the experience to the set of examples from this iteration
                pre_states.append(pre_state)
                post_states.append(state_buffer)
                rewards.append(reward)
                predicted_rewards.append(predicted_reward)
                actions.append(next_state)

        # Only the last state is a terminal state.
        is_not_terminal = [1 for i in range(0, len(actions) - 1, 1)]
        is_not_terminal.append(0)

        # Add all of the states from this iteration to the replay memory
        self.__add_to_replay_memory('pre_states', pre_states)
        self.__add_to_replay_memory('post_states', post_states)
        self.__add_to_replay_memory('actions', actions)
        self.__add_to_replay_memory('rewards', rewards)
        self.__add_to_replay_memory('predicted_rewards', predicted_rewards)
        self.__add_to_replay_memory('is_not_terminal', is_not_terminal)

        # If we are in the main loop, reduce the epsilon parameter so that the model will be called more often
        if not always_random:
            self.__epsilon -= self.__per_iter_epsilon_reduction
            self.__epsilon = max(self.__epsilon, self.__min_epsilon)

        return self.__experiences, len(actions)

    # Adds a set of examples to the replay memory
    def __add_to_replay_memory(self, field_name, data):
        if field_name not in self.__experiences:
            self.__experiences[field_name] = data
        else:
            self.__experiences[field_name] += data
            start_index = max(
                0,
                len(self.__experiences[field_name]) -
                self.__replay_memory_size)
            self.__experiences[field_name] = self.__experiences[field_name][
                start_index:]

    # Sample experiences from the replay memory
    def __sample_experiences(self, experiences, frame_count, sample_randomly):
        sampled_experiences = {}
        sampled_experiences['pre_states'] = []
        sampled_experiences['post_states'] = []
        sampled_experiences['actions'] = []
        sampled_experiences['rewards'] = []
        sampled_experiences['predicted_rewards'] = []
        sampled_experiences['is_not_terminal'] = []

        # Compute the surprise factor, which is the difference between the predicted and the actual Q value for each state.
        # We can use that to weight examples so that we are more likely to train on examples that the model got wrong.
        suprise_factor = np.abs(
            np.array(experiences['rewards'], dtype=np.dtype(float)) -
            np.array(experiences['predicted_rewards'], dtype=np.dtype(float)))
        suprise_factor_normalizer = np.sum(suprise_factor)
        suprise_factor /= float(suprise_factor_normalizer)
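        # Note: the callers in this class pass sample_randomly=True, so the
        # surprise weights above are only applied when sample_randomly is False.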

        # Generate one minibatch for each frame of the run
        for _ in range(0, frame_count, 1):
            if sample_randomly:
                idx_set = set(
                    np.random.choice(list(range(0, suprise_factor.shape[0],
                                                1)),
                                     size=(self.__batch_size),
                                     replace=False))
            else:
                idx_set = set(
                    np.random.choice(list(range(0, suprise_factor.shape[0],
                                                1)),
                                     size=(self.__batch_size),
                                     replace=False,
                                     p=suprise_factor))

            sampled_experiences['pre_states'] += [
                experiences['pre_states'][i] for i in idx_set
            ]
            sampled_experiences['post_states'] += [
                experiences['post_states'][i] for i in idx_set
            ]
            sampled_experiences['actions'] += [
                experiences['actions'][i] for i in idx_set
            ]
            sampled_experiences['rewards'] += [
                experiences['rewards'][i] for i in idx_set
            ]
            sampled_experiences['predicted_rewards'] += [
                experiences['predicted_rewards'][i] for i in idx_set
            ]
            sampled_experiences['is_not_terminal'] += [
                experiences['is_not_terminal'][i] for i in idx_set
            ]

        return sampled_experiences

    # Train the model on minibatches and post to the trainer node.
    def __publish_batch_and_update_model(self, batches, batches_count):
        # Train and get the gradients
        print(
            'Publishing epoch data and getting latest model from parameter server...'
        )
        gradients = self.__model.get_gradient_update_from_batches(batches)

        # Post the data to the trainer node
        if not self.__local_run:
            post_data = {}
            post_data['gradients'] = gradients
            post_data['batch_count'] = batches_count

            response = requests.post('http://{0}:80/gradient_update'.format(
                self.__trainer_ip_address),
                                     json=post_data)
            print('Response:')
            print(response)

            new_model_parameters = response.json()

            # Update the existing model with the new parameters
            self.__model.from_packet(new_model_parameters)

            # If the trainer sends us an epsilon, allow it to override our local value
            if ('epsilon' in new_model_parameters):
                new_epsilon = float(new_model_parameters['epsilon'])
                print(
                    'Overriding local epsilon with {0}, which was sent from trainer'
                    .format(new_epsilon))
                self.__epsilon = new_epsilon

        else:
            if (self.__num_batches_run > self.__batch_update_frequency +
                    self.__last_checkpoint_batch_count):
                self.__model.update_critic()

                checkpoint = {}
                checkpoint['model'] = self.__model.to_packet(get_target=True)
                checkpoint['batch_count'] = batches_count
                checkpoint_str = json.dumps(checkpoint)

                checkpoint_dir = os.path.join(
                    os.path.join(self.__data_dir, 'checkpoint'),
                    self.__experiment_name)

                if not os.path.isdir(checkpoint_dir):
                    try:
                        os.makedirs(checkpoint_dir)
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise

                file_name = os.path.join(
                    checkpoint_dir, '{0}.json'.format(self.__num_batches_run))
                with open(file_name, 'w') as f:
                    print('Checkpointing to {0}'.format(file_name))
                    f.write(checkpoint_str)

                self.__last_checkpoint_batch_count = self.__num_batches_run

    # Gets the latest model from the trainer node
    def __get_latest_model(self):
        print('Getting latest model from parameter server...')
        response = requests.get('http://{0}:80/latest'.format(
            self.__trainer_ip_address)).json()
        self.__model.from_packet(response)

    # Gets an image from AirSim
    def __get_image(self):
        image_response = self.__car_client.simGetImages(
            [airsim.ImageRequest(0, airsim.ImageType.Scene, False, False)])[0]
        image1d = np.frombuffer(image_response.image_data_uint8,
                                dtype=np.uint8)
        image_rgba = image1d.reshape(108, 256, 4)
        image_resized = image_rgba[49:108, 0:255, 0:3].astype(float)
        return image_resized

    # Computes the reward function based on the car position.
    def __compute_reward(self, collision_info, car_state):
        #Define some constant parameters for the reward function
        THRESH_DIST = 20  # The maximum distance from the center of the road to compute the reward function
        DISTANCE_DECAY_RATE = 1.2  # The rate at which the reward decays for the distance function
        CENTER_SPEED_MULTIPLIER = 2.0  # The ratio at which we prefer the distance reward to the speed reward

        # If the car has collided, give a fixed negative reward and end the run
        if (collision_info.has_collided):
            return -5.0, True

        # If the car is stopped, the reward is always zero
        speed = car_state.speed
        if (speed < 2):
            return 0.0, True

        #Get the car position
        position_key = bytes('position', encoding='utf8')
        x_val_key = bytes('x_val', encoding='utf8')
        y_val_key = bytes('y_val', encoding='utf8')

        #car_point = np.array([car_state.kinematics_true[position_key][x_val_key], car_state.kinematics_true[position_key][y_val_key], 0])
        car_point = np.array([
            car_state.kinematics_estimated.position.x_val,
            car_state.kinematics_estimated.position.y_val, 0
        ])

        # Distance component is exponential distance to nearest line
        distance = 999

        #Compute the distance to the nearest center line
        for line in self.__reward_points:
            local_distance = 0
            length_squared = ((line[0][0] - line[1][0])**2) + (
                (line[0][1] - line[1][1])**2)
            if (length_squared != 0):
                t = max(
                    0,
                    min(
                        1,
                        np.dot(car_point - line[0], line[1] - line[0]) /
                        length_squared))
                proj = line[0] + (t * (line[1] - line[0]))
                local_distance = np.linalg.norm(proj - car_point)
            distance = min(local_distance, distance)

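        # The reward decays exponentially with distance from the nearest
        # center line: reward = exp(-DISTANCE_DECAY_RATE * distance).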
        distance_reward = math.exp(-(distance * DISTANCE_DECAY_RATE))
        if (distance < THRESH_DIST):
            distance_reward *= 10
        return distance_reward, distance > THRESH_DIST

    # Initializes the points used for determining the starting point of the vehicle
    def __init_road_points(self):
        self.__road_points = []
        car_start_coords = [12961.722656, 6660.329102, 0]
        with open(
                os.path.join(os.path.join(self.__data_dir, 'data'),
                             'road_lines.txt'), 'r') as f:
            for line in f:
                points = line.split('\t')
                first_point = np.array(
                    [float(p) for p in points[0].split(',')] + [0])
                second_point = np.array(
                    [float(p) for p in points[1].split(',')] + [0])
                self.__road_points.append(tuple((first_point, second_point)))

        # Points in road_lines.txt are absolute Unreal coordinates (centimeters);
        # shift them relative to the car's start position and convert to meters
        for point_pair in self.__road_points:
            for point in point_pair:
                point[0] -= car_start_coords[0]
                point[1] -= car_start_coords[1]
                point[0] /= 100
                point[1] /= 100

    # Initializes the points used for determining the optimal position of the vehicle during the reward function
    def __init_reward_points(self):
        self.__reward_points = []
        with open(
                os.path.join(os.path.join(self.__data_dir, 'data'),
                             'reward_points.txt'), 'r') as f:
            for line in f:
                point_values = line.split('\t')
                first_point = np.array(
                    [float(point_values[0]),
                     float(point_values[1]), 0])
                second_point = np.array(
                    [float(point_values[2]),
                     float(point_values[3]), 0])
                self.__reward_points.append(tuple((first_point, second_point)))

    # Randomly selects a starting point on the road
    # Used for initializing an iteration of data generation from AirSim
    def __get_next_starting_point(self):
        car_state = self.__car_client.getCarState()
        random_line_index = np.random.randint(0, high=len(self.__road_points))
        random_interp = (np.random.random_sample() * 0.4) + 0.3
        random_direction_interp = np.random.random_sample()
        random_line = self.__road_points[random_line_index]
        random_start_point = list(random_line[0])
        random_start_point[0] += (random_line[1][0] -
                                  random_line[0][0]) * random_interp
        random_start_point[1] += (random_line[1][1] -
                                  random_line[0][1]) * random_interp

        # Compute the direction that the vehicle will face
        # Vertical line
        if (np.isclose(random_line[0][1], random_line[1][1])):
            if (random_direction_interp > 0.5):
                random_direction = (0, 0, 0)
            else:
                random_direction = (0, 0, math.pi)
        # Horizontal line
        elif (np.isclose(random_line[0][0], random_line[1][0])):
            if (random_direction_interp > 0.5):
                random_direction = (0, 0, math.pi / 2)
            else:
                random_direction = (0, 0, -1.0 * math.pi / 2)
        # Fallback for non-axis-aligned segments so random_direction is always defined
        else:
            random_direction = (0, 0, 0)

        # The z coordinate is always zero
        random_start_point[2] = -0
        return (random_start_point, random_direction)

    # A helper function to make a directory if it does not exist
    def __make_dir_if_not_exist(self, directory):
        if not (os.path.exists(directory)):
            try:
                os.makedirs(directory)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
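
A minimal sketch of how this agent might be launched for a local run. Every
parameter value below is a placeholder rather than a value from the original
project, and it assumes AirSim is already running on the machine and that
data_dir contains the data\road_lines.txt and data\reward_points.txt files the
constructor reads.

if __name__ == '__main__':
    # Placeholder configuration; all values here are assumptions.
    agent_parameters = {
        'data_dir': 'Z:\\data',
        'max_epoch_runtime_sec': '30',
        'replay_memory_size': '50',
        'batch_size': '32',
        'min_epsilon': '0.1',
        'per_iter_epsilon_reduction': '0.003',
        'experiment_name': 'local_test',
        'train_conv_layers': 'false',
        'batch_update_frequency': '300',
        'airsim_path': 'D:\\AD_Cookbook_AirSim',
        'local_run': 'true',  # presence of the key is what matters
    }
    agent = DistributedAgent(agent_parameters)
    agent.start()
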
Example 2

# In addition to the imports used in Example 1, this variant needs PIL,
# random, and two project-local modules whose paths are assumed here: the
# coverage map that drives the reward and a Euler-to-quaternion helper used
# when repositioning the car.
from random import randint

from PIL import Image

from coverage_map import CoverageMap  # assumed project-local module
from utils import toQuaternion  # assumed project-local helper


class DistributedAgent():
    def __init__(self, parameters):
        required_parameters = [
            'data_dir', 'max_epoch_runtime_sec', 'replay_memory_size',
            'batch_size', 'min_epsilon', 'per_iter_epsilon_reduction',
            'experiment_name', 'train_conv_layers', 'start_x', 'start_y',
            'start_z', 'log_path'
        ]
        for required_parameter in required_parameters:
            if required_parameter not in parameters:
                raise ValueError('Missing required parameter {0}'.format(
                    required_parameter))

        parameters['role_type'] = 'agent'

        print('Starting time: {0}'.format(datetime.datetime.utcnow()),
              file=sys.stderr)
        self.__model_buffer = None
        self.__model = None
        self.__airsim_started = False
        self.__data_dir = parameters['data_dir']
        self.__per_iter_epsilon_reduction = float(
            parameters['per_iter_epsilon_reduction'])
        self.__min_epsilon = float(parameters['min_epsilon'])
        self.__max_epoch_runtime_sec = float(
            parameters['max_epoch_runtime_sec'])
        self.__replay_memory_size = int(parameters['replay_memory_size'])
        self.__batch_size = int(parameters['batch_size'])
        self.__experiment_name = parameters['experiment_name']
        self.__train_conv_layers = bool(
            (parameters['train_conv_layers'].lower().strip() == 'true'))
        self.__epsilon = 1
        self.__num_batches_run = 0
        self.__last_checkpoint_batch_count = 0

        if 'batch_update_frequency' in parameters:
            self.__batch_update_frequency = int(
                parameters['batch_update_frequency'])

        if 'weights_path' in parameters:
            self.__weights_path = parameters['weights_path']
        else:
            self.__weights_path = None

        if 'airsim_path' in parameters:
            self.__airsim_path = parameters['airsim_path']
        else:
            self.__airsim_path = None

        self.__local_run = 'local_run' in parameters

        self.__car_client = None
        self.__car_controls = None

        self.__minibatch_dir = os.path.join(self.__data_dir, 'minibatches')
        self.__output_model_dir = os.path.join(self.__data_dir, 'models')

        self.__make_dir_if_not_exist(self.__minibatch_dir)
        self.__make_dir_if_not_exist(self.__output_model_dir)
        self.__last_model_file = ''

        self.__possible_ip_addresses = []
        self.__trainer_ip_address = None

        self.__experiences = {}

        self.__start_point = [
            float(parameters['start_x']),
            float(parameters['start_y']),
            float(parameters['start_z'])
        ]
        self.__log_file = parameters['log_path']

        # initiate coverage map
        self.__coverage_map = CoverageMap(start_point=self.__start_point,
                                          map_size=12000,
                                          scale_ratio=20,
                                          state_size=6000,
                                          input_size=20,
                                          height_threshold=0.9,
                                          reward_norm=30,
                                          paint_radius=15)

        # Create the rewards log under (data_dir)\checkpoint\(experiment_name).
        # Note: the directory component must not start with a backslash or
        # os.path.join will discard data_dir.
        if not os.path.isdir(
                os.path.join(self.__data_dir, 'checkpoint',
                             self.__experiment_name)):
            os.makedirs(
                os.path.join(self.__data_dir, 'checkpoint',
                             self.__experiment_name))
        self.__rewards_log = open(
            os.path.join(self.__data_dir, 'checkpoint',
                         self.__experiment_name, "rewards.txt"), "w")
        self.__rewards_log.write("Timestamp\tSum\tMean\n")
        self.__rewards_log.close()

        # create starting points list
        #self.__starting_points = self.__get_starting_points()
        self.__starting_points = [self.__start_point]

    # Starts the agent
    def start(self):
        self.__run_function()

    # The function that will be run during training.
    # It will initialize the connection to the trainer, start AirSim, and continuously run training iterations.
    def __run_function(self):
        print('Starting run function')

        # Once the trainer is online, it will write its IP to a file at (data_dir)\trainer_ip\(experiment_name)\trainer_ip.txt
        # Wait for that file to exist
        if not self.__local_run:
            print('Waiting for trainer to come online')
            while True:
                trainer_ip_dir = os.path.join(
                    os.path.join(self.__data_dir, 'trainer_ip'),
                    self.__experiment_name)
                print('Checking {0}...'.format(trainer_ip_dir))
                if os.path.isdir(trainer_ip_dir):
                    with open(os.path.join(trainer_ip_dir, 'trainer_ip.txt'),
                              'r') as f:
                        self.__possible_ip_addresses.append(f.read().replace(
                            '\n', ''))
                        break
                print('Not online yet. Sleeping...')
                time.sleep(5)

            # We now have the IP address for the trainer. Attempt to ping the trainer.
            ping_idx = -1
            while True:
                ping_idx += 1
                print('Attempting to ping trainer...')
                try:
                    print('\tPinging {0}...'.format(
                        self.__possible_ip_addresses[ping_idx % len(
                            self.__possible_ip_addresses)]))
                    response = requests.get('http://{0}:80/ping'.format(
                        self.__possible_ip_addresses[ping_idx % len(
                            self.__possible_ip_addresses)])).json()
                    if response['message'] != 'pong':
                        raise ValueError(
                            'Received unexpected message: {0}'.format(
                                response))
                    print('Success!')
                    self.__trainer_ip_address = self.__possible_ip_addresses[
                        ping_idx % len(self.__possible_ip_addresses)]
                    break
                except Exception as e:
                    print('Could not get response. Message is {0}'.format(e))
                    if (ping_idx % len(self.__possible_ip_addresses) == 0):
                        print('Waiting 5 seconds and trying again...')
                        time.sleep(5)

            # Get the latest model from the trainer
            print('Getting model from the trainer')
            sys.stdout.flush()
            buffer_len = 4
            self.__model = RlModel(weights_path=self.__weights_path,
                                   train_conv_layers=self.__train_conv_layers,
                                   buffer_len=buffer_len)
            self.__get_latest_model()

        else:
            print('Run is local. Skipping connection to trainer.')
            buffer_len = 4
            self.__model = RlModel(weights_path=self.__weights_path,
                                   train_conv_layers=self.__train_conv_layers,
                                   buffer_len=buffer_len)

        # Connect to the AirSim exe
        self.__connect_to_airsim()

        # Fill the replay memory by driving randomly.
        print('Filling replay memory...')
        while True:
            print('Running Airsim Epoch.')
            try:
                _, num_of_actions = self.__run_airsim_epoch(True)
                if num_of_actions > 0:
                    percent_full = 100.0 * len(self.__experiences['actions']
                                               ) / self.__replay_memory_size
                    print(
                        'Replay memory now contains {0} members. ({1}% full)'.
                        format(len(self.__experiences['actions']),
                               percent_full))

                    if (percent_full >= 100.0):
                        break
            except msgpackrpc.error.TimeoutError:
                print(
                    'Lost connection to AirSim while filling replay memory. Attempting to reconnect.'
                )
                self.__connect_to_airsim()

        # Get the latest model. Other agents may have finished before us.
        print('Replay memory filled. Starting main loop...')

        if not self.__local_run:
            self.__get_latest_model()
        while True:
            try:
                if (self.__model is not None):

                    #Generate a series of training examples by driving the vehicle in AirSim
                    print('Running Airsim Epoch.')
                    experiences, frame_count = self.__run_airsim_epoch(False)

                    # If we didn't immediately crash, train on the gathered experiences
                    if (frame_count > 0):
                        print('Generating {0} minibatches...'.format(
                            frame_count))

                        print('Sampling Experiences.')
                        # Sample experiences from the replay memory
                        sampled_experiences = self.__sample_experiences(
                            experiences, frame_count, True)

                        self.__num_batches_run += frame_count

                        # If we successfully sampled, train on the collected minibatches and send the gradients to the trainer node
                        if (len(sampled_experiences) > 0):
                            print('Publishing AirSim Epoch.')

                            # write all rewards to log file
                            self.__rewards_log = open(
                                os.path.join(self.__data_dir, 'checkpoint',
                                             self.__experiment_name,
                                             "rewards.txt"), "a+")
                            rewards_sum = 0
                            for reward in sampled_experiences['rewards']:
                                rewards_sum += reward
                            self.__rewards_log.write("{}\t{}\t{}\n".format(
                                time.time(), rewards_sum, rewards_sum /
                                len(sampled_experiences['rewards'])))
                            self.__rewards_log.close()

                            self.__publish_batch_and_update_model(
                                sampled_experiences, frame_count)

            # Occasionally, the AirSim exe will stop working.
            # For example, if a user connects to the node to visualize progress.
            # In that case, attempt to reconnect.
            except msgpackrpc.error.TimeoutError:
                print('Lost connection to AirSim. Attempting to reconnect.')
                self.__connect_to_airsim()

    # Connects to the AirSim Exe.
    # Assume that it is already running. After 10 successive attempts, attempt to restart the executable.
    def __connect_to_airsim(self):
        attempt_count = 0
        while True:
            try:
                print('Attempting to connect to AirSim (attempt {0})'.format(
                    attempt_count))
                self.__car_client = airsim.CarClient()
                self.__car_client.confirmConnection()
                self.__car_client.enableApiControl(True)
                self.__car_controls = airsim.CarControls()
                self.__coverage_map.set_client(
                    client=self.__car_client)  # update client on coverage map
                print('Connected!')
                return
            except:
                print('Failed to connect.')
                attempt_count += 1
                if (attempt_count % 10 == 0):
                    print(
                        '10 consecutive failures to connect. Attempting to start AirSim on my own.'
                    )

                    if self.__local_run:
                        os.system('START "" powershell.exe {0}'.format(
                            os.path.join(
                                self.__airsim_path,
                                'AD_Cookbook_Start_AirSim.ps1 neighborhood -windowed'
                            )))
                    else:
                        os.system(
                            'START "" powershell.exe D:\\AD_Cookbook_AirSim\\Scripts\\DistributedRL\\restart_airsim_if_agent.ps1'
                        )
                print('Waiting a few seconds.')
                time.sleep(10)

    # Appends a sample to a ring buffer.
    # If the appended example takes the size of the buffer over buffer_size, the example at the front will be removed.
    def __append_to_ring_buffer(self, item, buffer, buffer_size):
        if (len(buffer) >= buffer_size):
            buffer = buffer[1:]
        buffer.append(item)
        return buffer

    # Runs an iteration of data generation from AirSim.
    # Data will be saved in the replay memory.
    def __run_airsim_epoch(self, always_random):
        print('Running AirSim epoch.')

        # reset coverage map
        self.__coverage_map.reset()

        # Pick a random starting point on the roads
        starting_points, starting_direction = self.__get_next_starting_point()

        # Initialize the state buffer.
        # For now, save 4 images at 0.01 second intervals.
        state_buffer_len = 4
        state_buffer = []
        wait_delta_sec = 0.01

        print('Getting Pose')
        self.__car_client.simSetVehiclePose(
            airsim.Pose(
                airsim.Vector3r(starting_points[0], starting_points[1],
                                starting_points[2]),
                toQuaternion(starting_direction[0], starting_direction[1],
                             starting_direction[2])), True)

        # Currently, simSetVehiclePose does not allow us to set the velocity.
        # So, if we crash and call simSetVehiclePose, the car will be still moving at its previous velocity.
        # We need the car to stop moving, so push the brake and wait for a few seconds.
        print('Waiting for momentum to die')
        self.__car_controls.steering = 0
        self.__car_controls.throttle = 0
        self.__car_controls.brake = 1
        self.__car_client.setCarControls(self.__car_controls)
        time.sleep(4)

        print('Resetting')
        self.__car_client.simSetVehiclePose(
            airsim.Pose(
                airsim.Vector3r(starting_points[0], starting_points[1],
                                starting_points[2]),
                toQuaternion(starting_direction[0], starting_direction[1],
                             starting_direction[2])), True)

        #Start the car rolling so it doesn't get stuck
        print('Running car for a few seconds...')
        self.__car_controls.steering = 0
        self.__car_controls.throttle = 0.4
        self.__car_controls.brake = 0
        self.__car_client.setCarControls(self.__car_controls)

        # While the car is rolling, start initializing the state buffer
        stop_run_time = datetime.datetime.now() + datetime.timedelta(seconds=1)
        while (datetime.datetime.now() < stop_run_time):
            time.sleep(wait_delta_sec)
            image, _ = self.__get_image()
            state_buffer = self.__append_to_ring_buffer(
                image, state_buffer, state_buffer_len)
        done = False
        actions = []  # records the state (action index) chosen at each step
        pre_states = []
        post_states = []
        rewards = []
        predicted_rewards = []
        car_state = self.__car_client.getCarState()

        # slow down a bit
        self.__car_controls.throttle = 0.3
        self.__car_client.setCarControls(self.__car_controls)

        start_time = datetime.datetime.utcnow()
        end_time = start_time + datetime.timedelta(
            seconds=self.__max_epoch_runtime_sec)

        num_random = 0

        # Main data collection loop
        while not done:
            collision_info = self.__car_client.simGetCollisionInfo()
            utc_now = datetime.datetime.utcnow()

            # Check for terminal conditions:
            # 1) Car has collided
            # 2) Car is stopped
            # 3) The run has been running for longer than max_epoch_runtime_sec.
            #       This constraint is so the model doesn't end up having to churn through huge chunks of data, slowing down training
            if (collision_info.has_collided or abs(car_state.speed) < 0.02
                    or utc_now > end_time):
                print('Start time: {0}, end time: {1}'.format(
                    start_time, utc_now),
                      file=sys.stderr)
                if (utc_now > end_time):
                    print('timed out.')
                    print(
                        'Full autonomous run finished at {0}'.format(utc_now),
                        file=sys.stderr)
                done = True
                sys.stderr.flush()
            else:

                # The Agent should occasionally pick random action instead of best action
                do_greedy = np.random.random_sample()
                pre_state = copy.deepcopy(state_buffer)
                if (do_greedy < self.__epsilon or always_random):
                    num_random += 1
                    next_state = self.__model.get_random_state()
                    predicted_reward = 0
                else:
                    next_state, predicted_reward, _ = self.__model.predict_state(
                        pre_state)

                    print('Model predicts {0}'.format(next_state))

                # Convert the selected state to a control signal
                next_steering, next_brake = self.__model.state_to_control_signals(
                    next_state, self.__car_client.getCarState())

                # Take the action
                self.__car_controls.steering = next_steering
                self.__car_controls.brake = next_brake
                self.__car_client.setCarControls(self.__car_controls)

                # Wait for a short period of time to see outcome
                time.sleep(wait_delta_sec)

                # Observe outcome and compute reward from action
                post_image, cov_reward = self.__get_image()

                state_buffer = self.__append_to_ring_buffer(
                    post_image, state_buffer, state_buffer_len)
                car_state = self.__car_client.getCarState()
                collision_info = self.__car_client.simGetCollisionInfo()
                reward = self.__compute_reward(collision_info, car_state,
                                               cov_reward, next_state)

                # Add the experience to the set of examples from this iteration
                pre_states.append(pre_state)
                post_states.append(state_buffer)
                rewards.append(reward)
                predicted_rewards.append(predicted_reward)
                actions.append(next_state)

        # Only the last state is a terminal state.
        is_not_terminal = [1 for i in range(0, len(actions) - 1, 1)]
        is_not_terminal.append(0)

        # only add to the replay memory if have enough data
        if len(actions) > 30:
            # Add all of the states from this iteration to the replay memory
            self.__add_to_replay_memory('pre_states', pre_states)
            self.__add_to_replay_memory('post_states', post_states)
            self.__add_to_replay_memory('actions', actions)
            self.__add_to_replay_memory('rewards', rewards)
            self.__add_to_replay_memory('predicted_rewards', predicted_rewards)
            self.__add_to_replay_memory('is_not_terminal', is_not_terminal)

            print('Percent random actions: {0}'.format(num_random /
                                                       max(1, len(actions))))
            print('Num total actions: {0}'.format(len(actions)))

            # If we are in the main loop, reduce the epsilon parameter so that the model will be called more often
            # Note: this will be overwritten by the trainer's epsilon if running in distributed mode
            if not always_random:
                self.__epsilon -= self.__per_iter_epsilon_reduction
                self.__epsilon = max(self.__epsilon, self.__min_epsilon)

            return self.__experiences, len(actions)
        else:
            return self.__experiences, 0

    # Adds a set of examples to the replay memory
    def __add_to_replay_memory(self, field_name, data):
        if field_name not in self.__experiences:
            self.__experiences[field_name] = data
        else:
            self.__experiences[field_name] += data
            start_index = max(
                0,
                len(self.__experiences[field_name]) -
                self.__replay_memory_size)
            self.__experiences[field_name] = self.__experiences[field_name][
                start_index:]

    # Sample experiences from the replay memory
    def __sample_experiences(self, experiences, frame_count, sample_randomly):
        sampled_experiences = {}
        sampled_experiences['pre_states'] = []
        sampled_experiences['post_states'] = []
        sampled_experiences['actions'] = []
        sampled_experiences['rewards'] = []
        sampled_experiences['predicted_rewards'] = []
        sampled_experiences['is_not_terminal'] = []

        # Compute the surprise factor, which is the difference between the predicted and the actual Q value for each state.
        # We can use that to weight examples so that we are more likely to train on examples that the model got wrong.
        suprise_factor = np.abs(
            np.array(experiences['rewards'], dtype=np.dtype(float)) -
            np.array(experiences['predicted_rewards'], dtype=np.dtype(float)))
        suprise_factor_normalizer = np.sum(suprise_factor)
        suprise_factor /= float(suprise_factor_normalizer)

        # Generate one minibatch for each frame of the run
        for _ in range(0, frame_count, 1):
            if sample_randomly:
                idx_set = set(
                    np.random.choice(list(range(0, suprise_factor.shape[0],
                                                1)),
                                     size=(self.__batch_size),
                                     replace=False))
            else:
                idx_set = set(
                    np.random.choice(list(range(0, suprise_factor.shape[0],
                                                1)),
                                     size=(self.__batch_size),
                                     replace=False,
                                     p=suprise_factor))

            sampled_experiences['pre_states'] += [
                experiences['pre_states'][i] for i in idx_set
            ]
            sampled_experiences['post_states'] += [
                experiences['post_states'][i] for i in idx_set
            ]
            sampled_experiences['actions'] += [
                experiences['actions'][i] for i in idx_set
            ]
            sampled_experiences['rewards'] += [
                experiences['rewards'][i] for i in idx_set
            ]
            sampled_experiences['predicted_rewards'] += [
                experiences['predicted_rewards'][i] for i in idx_set
            ]
            sampled_experiences['is_not_terminal'] += [
                experiences['is_not_terminal'][i] for i in idx_set
            ]

        return sampled_experiences

    # Train the model on minibatches and post to the trainer node.
    # The trainer node will respond with the latest version of the model that will be used in further data generation iterations.
    def __publish_batch_and_update_model(self, batches, batches_count):
        # Train and get the gradients
        print(
            'Publishing epoch data and getting latest model from parameter server...'
        )
        gradients = self.__model.get_gradient_update_from_batches(batches)

        # Post the data to the trainer node
        if not self.__local_run:
            post_data = {}
            post_data['gradients'] = gradients
            post_data['batch_count'] = batches_count

            response = requests.post('http://{0}:80/gradient_update'.format(
                self.__trainer_ip_address),
                                     json=post_data)
            print('Response:')
            print(response)

            new_model_parameters = response.json()

            # Update the existing model with the new parameters
            self.__model.from_packet(new_model_parameters)

            # If the trainer sends us an epsilon, allow it to override our local value
            if ('epsilon' in new_model_parameters):
                new_epsilon = float(new_model_parameters['epsilon'])
                print(
                    'Overriding local epsilon with {0}, which was sent from trainer'
                    .format(new_epsilon))
                self.__epsilon = new_epsilon

        else:
            if (self.__num_batches_run > self.__batch_update_frequency +
                    self.__last_checkpoint_batch_count):
                self.__model.update_critic()

                checkpoint = {}
                checkpoint['model'] = self.__model.to_packet(get_target=True)
                checkpoint['batch_count'] = batches_count
                checkpoint_str = json.dumps(checkpoint)

                checkpoint_dir = os.path.join(
                    os.path.join(self.__data_dir, 'checkpoint'),
                    self.__experiment_name)

                if not os.path.isdir(checkpoint_dir):
                    try:
                        os.makedirs(checkpoint_dir)
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise

                file_name = os.path.join(
                    checkpoint_dir, '{0}.json'.format(self.__num_batches_run))
                with open(file_name, 'w') as f:
                    print('Checkpointing to {0}'.format(file_name))
                    f.write(checkpoint_str)

                self.__last_checkpoint_batch_count = self.__num_batches_run

    # Gets the latest model from the trainer node
    def __get_latest_model(self):
        print('Getting latest model from parameter server...')
        response = requests.get('http://{0}:80/latest'.format(
            self.__trainer_ip_address)).json()
        self.__model.from_packet(response)

    # Gets a coverage image from AirSim
    def __get_cov_image(self):

        state, cov_reward = self.__coverage_map.get_state_from_pose()

        # debug only
        #im = Image.fromarray(np.uint8(state))
        #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

        # normalize state
        state = state / 255.0

        return state, cov_reward

    # Gets an image from AirSim
    def __get_image(self):

        responses = self.__car_client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective,
                                True, False)
        ])
        img1d = np.array(responses[0].image_data_float, dtype=np.float64)

        if img1d.size > 1:

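            # Invert the depth values so that nearby obstacles map to bright
            # pixels (255 / depth, with the denominator clamped to at least 1).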
            img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
            img2d = np.reshape(img1d,
                               (responses[0].height, responses[0].width))

            image = Image.fromarray(img2d)

            # debug only
            #image_png = image.convert('RGB')
            #image_png.save("DistributedRL\\debug\\{}.png".format(time.time()))

            depth_im = np.array(image.resize((84, 84)).convert('L'))
            depth_im = depth_im / 255.0
        else:
            depth_im = np.zeros((84, 84)).astype(float)

        cov_im, cov_reward = self.__get_cov_image()
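        # Paste the coverage-map patch into the top-left corner of the depth
        # image so both signals are fed to the network as a single input.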
        depth_im[:cov_im.shape[0], :cov_im.shape[1]] = cov_im

        #image = Image.fromarray(depth_im)
        #image.save("DistributedRL\\debug\\{}.png".format(time.time()))

        return depth_im, cov_reward
        """
        image_response = self.__car_client.simGetImages([airsim.ImageRequest("RCCamera", airsim.ImageType.Scene, False, False)])[0]
        image1d = np.fromstring(image_response.image_data_uint8, dtype=np.uint8)
        if image1d.size > 1:
            image_rgba = image1d.reshape(image_response.height, image_response.width, 4)

            #im = Image.fromarray(np.uint8(image_rgba))
            #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

            image_rgba = image_rgba / 255.0

            return image_rgba[60:144,86:170,0:3].astype(float)
        
        return np.zeros((84,84,3)).astype(float)
        """

    # Computes the reward function based on collision, speed, and new coverage.
    def __compute_reward(self, collision_info, car_state, cov_reward, action):

        alpha = 1.0
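        # With alpha = 1.0 the reward is driven entirely by new coverage; the
        # direction_reward term below is effectively disabled.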

        # If the car has collided, the reward is always zero
        if (collision_info.has_collided):
            return 0.0

        # If the car has stopped for some reason, the reward is always zero
        if abs(car_state.speed) < 0.02:
            return 0.0

        # If there is no new coverage, there is no reward
        if cov_reward < 0.1:
            return 0.0

        # straight will be rewarded as 1.0, semi straight as 0.5
        direction_reward = float(2 - abs(action - 2)) / 2.0

        # final reward
        reward = alpha * cov_reward + (1 - alpha) * direction_reward

        #print("cov reward: {}, reward: {}".format(cov_reward, reward))
        return reward

    # prepare starting points list
    def __get_starting_points(self):

        starting_points_list = []
        with open(os.path.join(self.__data_dir, 'data',
                               'starting_points.txt')) as starting_points_file:
            for line in starting_points_file:
                starting_points_list.append(
                    [float(x) for x in line.split(' ')[:3]])

        return starting_points_list

    # get most newly generated random point
    def __get_next_generated_random_point(self):
        """
        # grab the newest line with generated random point
        newest_rp = "None"

        # keep searching until the simulation is giving something
        while newest_rp == "None":
            
            # notify user
            print("Searching for a random point...")

            # open log file
            log_file = open(self.__log_file, "r")

            # search for the newest generated random point line
            for line in log_file:
                if "RandomPoint" in line:
                    newest_rp = line
            
        # notify user
        print("Found random point.")
        
        # filter random point from line
        random_point = [float(newest_rp.split(" ")[-3].split("=")[1]), float(newest_rp.split(" ")[-2].split("=")[1]), float(newest_rp.split(" ")[-1].split("=")[1])]
        return random_point
        """
        idx = randint(0, len(self.__starting_points) - 1)
        return self.__starting_points[idx]

    # Randomly selects a starting point on the road
    # Used for initializing an iteration of data generation from AirSim
    def __get_next_starting_point(self):

        # Get a random start point from the starting points list and make it relative to the agent's start point
        random_start_point = self.__get_next_generated_random_point()
        random_start_point = [
            random_start_point[0] - self.__start_point[0],
            random_start_point[1] - self.__start_point[1],
            random_start_point[2] - self.__start_point[2]
        ]
        random_start_point = [x / 100.0 for x in random_start_point]

        # draw random orientation
        #random_direction = (0, 0, np.random.uniform(-math.pi,math.pi))
        random_direction = (0, 0, 0)

        # Get the current state of the vehicle
        car_state = self.__car_client.getCarState()

        # The z coordinate is always zero
        random_start_point[2] = -0
        return (random_start_point, random_direction)

    # A helper function to make a directory if it does not exist
    def __make_dir_if_not_exist(self, directory):
        if not (os.path.exists(directory)):
            try:
                os.makedirs(directory)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
class DistributedAgent():
    def __init__(self, parameters):
        required_parameters = ['data_dir', 'max_epoch_runtime_sec', 'replay_memory_size', 'batch_size', 'min_epsilon', 'per_iter_epsilon_reduction', 'experiment_name', 'train_conv_layers']
        for required_parameter in required_parameters:
            if required_parameter not in parameters:
                raise ValueError('Missing required parameter {0}'.format(required_parameter))

        parameters['role_type'] = 'agent'

        self.__model_buffer = None
        self.__model = None
        self.__airsim_started = False
        self.__data_dir = parameters['data_dir']
        self.__per_iter_epsilon_reduction = float(parameters['per_iter_epsilon_reduction'])
        self.__min_epsilon = float(parameters['min_epsilon'])
        self.__max_epoch_runtime_sec = float(parameters['max_epoch_runtime_sec'])
        self.__replay_memory_size = int(parameters['replay_memory_size'])
        self.__batch_size = int(parameters['batch_size'])
        self.__experiment_name = parameters['experiment_name']
        self.__train_conv_layers = bool(parameters['train_conv_layers'])
        self.__epsilon = 1
        self.__num_batches_run = 0
        self.__last_checkpoint_batch_count = 0
        
        if 'batch_update_frequency' in parameters:
            self.__batch_update_frequency = int(parameters['batch_update_frequency'])
        
        if 'weights_path' in parameters:
            self.__weights_path = parameters['weights_path']
        else:
            self.__weights_path = None
            
        if 'airsim_path' in parameters:
            self.__airsim_path = parameters['airsim_path']
        else:
            self.__airsim_path = None

        self.__local_run = 'local_run' in parameters

        self.__car_client = None
        self.__car_controls = None

        self.__minibatch_dir = os.path.join(self.__data_dir, 'minibatches')
        self.__output_model_dir = os.path.join(self.__data_dir, 'models')

        self.__make_dir_if_not_exist(self.__minibatch_dir)
        self.__make_dir_if_not_exist(self.__output_model_dir)
        self.__last_model_file = ''

        self.__possible_ip_addresses = []
        self.__trainer_ip_address = None

        self.__experiences = {}

        self.__init_road_points()
        self.__init_reward_points()

    # Starts the agent
    def start(self):
        self.__run_function()

    # The function that will be run during training.
    # It will initialize the connection to the trainer, start AirSim, and continuously run training iterations.
    def __run_function(self):
        print('Starting run function')
        
        # Once the trainer is online, it will write its IP to a file in (data_dir)\trainer_ip\trainer_ip.txt
        # Wait for that file to exist
        if not self.__local_run:
            print('Waiting for trainer to come online')
            while True:
                trainer_ip_dir = os.path.join(os.path.join(self.__data_dir, 'trainer_ip'), self.__experiment_name)
                print('Checking {0}...'.format(trainer_ip_dir))
                if os.path.isdir(trainer_ip_dir):
                    with open(os.path.join(trainer_ip_dir, 'trainer_ip.txt'), 'r') as f:
                        self.__possible_ip_addresses.append(f.read().replace('\n', ''))
                        break
                print('Not online yet. Sleeping...')
                time.sleep(5)
        
            # We now have the IP address for the trainer. Attempt to ping the trainer.
            ping_idx = -1
            while True:
                ping_idx += 1
                print('Attempting to ping trainer...')
                try:
                    print('\tPinging {0}...'.format(self.__possible_ip_addresses[ping_idx % len(self.__possible_ip_addresses)]))
                    response = requests.get('http://{0}:80/ping'.format(self.__possible_ip_addresses[ping_idx % len(self.__possible_ip_addresses)])).json()
                    if response['message'] != 'pong':
                        raise ValueError('Received unexpected message: {0}'.format(response))
                    print('Success!')
                    self.__trainer_ip_address = self.__possible_ip_addresses[ping_idx % len(self.__possible_ip_addresses)]
                    break
                except Exception as e:
                    print('Could not get response. Message is {0}'.format(e))
                    if (ping_idx % len(self.__possible_ip_addresses) == 0):
                        print('Waiting 5 seconds and trying again...')
                        time.sleep(5)

            # Get the latest model from the trainer
            print('Getting model from the trainer')
            sys.stdout.flush()
            self.__model = RlModel(self.__weights_path, self.__train_conv_layers)
            self.__get_latest_model()
        
        else:
            print('Run is local. Skipping connection to trainer.')
            self.__model = RlModel(self.__weights_path, self.__train_conv_layers)
            
            
        # Connect to the AirSim exe
        self.__connect_to_airsim()

        # Fill the replay memory by driving randomly.
        print('Filling replay memory...')
        while True:
            print('Running Airsim Epoch.')
            try:
                self.__run_airsim_epoch(True)
                percent_full = 100.0 * len(self.__experiences['actions'])/self.__replay_memory_size
                print('Replay memory now contains {0} members. ({1}% full)'.format(len(self.__experiences['actions']), percent_full))

                if (percent_full >= 100.0):
                    break
            except msgpackrpc.error.TimeoutError:
                print('Lost connection to AirSim while filling replay memory. Attempting to reconnect.')
                self.__connect_to_airsim()
            
        # Get the latest model. Other agents may have finished before us.
        print('Replay memory filled. Starting main loop...')
        
        if not self.__local_run:
            self.__get_latest_model()
        while True:
            try:
                if (self.__model is not None):

                    #Generate a series of training examples by driving the vehicle in AirSim
                    print('Running Airsim Epoch.')
                    experiences, frame_count = self.__run_airsim_epoch(False)

                    # If we didn't immediately crash, train on the gathered experiences
                    if (frame_count > 0):
                        print('Generating {0} minibatches...'.format(frame_count))

                        print('Sampling Experiences.')
                        # Sample experiences from the replay memory
                        sampled_experiences = self.__sample_experiences(experiences, frame_count, True)

                        self.__num_batches_run += frame_count
                        
                        # If we successfully sampled, train on the collected minibatches and send the gradients to the trainer node
                        if (len(sampled_experiences) > 0):
                            print('Publishing AirSim Epoch.')
                            self.__publish_batch_and_update_model(sampled_experiences, frame_count)
            
            # Occasionally, the AirSim exe will stop working.
            # For example, if a user connects to the node to visualize progress.
            # In that case, attempt to reconnect.
            except msgpackrpc.error.TimeoutError:
                print('Lost connection to AirSim. Attempting to reconnect.')
                self.__connect_to_airsim()

    # Connects to the AirSim Exe.
    # Assume that it is already running. After 10 consecutive failed attempts, try to restart the executable.
    def __connect_to_airsim(self):
        attempt_count = 0
        while True:
            try:
                print('Attempting to connect to AirSim (attempt {0})'.format(attempt_count))
                self.__car_client = CarClient()
                self.__car_client.confirmConnection()
                self.__car_client.enableApiControl(True)
                self.__car_controls = CarControls()
                print('Connected!')
                return
            except:
                print('Failed to connect.')
                attempt_count += 1
                if (attempt_count % 10 == 0):
                    print('10 consecutive failures to connect. Attempting to start AirSim on my own.')
                    
                    if self.__local_run:
                        os.system('START "" powershell.exe {0}'.format(os.path.join(self.__airsim_path, 'AD_Cookbook_Start_AirSim.ps1 neighborhood -windowed')))
                    else:
                        os.system('START "" powershell.exe D:\\AD_Cookbook_AirSim\\Scripts\\DistributedRL\\restart_airsim_if_agent.ps1')
                print('Waiting a few seconds.')
                time.sleep(10)

    # Appends a sample to a ring buffer.
    # If the appended example takes the size of the buffer over buffer_size, the example at the front will be removed.
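    # Example (hypothetical values): __append_to_ring_buffer(4, [1, 2, 3], 3) returns [2, 3, 4].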
    def __append_to_ring_buffer(self, item, buffer, buffer_size):
        if (len(buffer) >= buffer_size):
            buffer = buffer[1:]
        buffer.append(item)
        return buffer
    
    # Runs an iteration of data generation from AirSim.
    # Data will be saved in the replay memory.
    def __run_airsim_epoch(self, always_random):
        print('Running AirSim epoch.')
        
        # Pick a random starting point on the roads
        starting_points, starting_direction = self.__get_next_starting_point()
        
        # Initialize the state buffer.
        # For now, save 4 images at 0.01 second intervals.
        state_buffer_len = 4
        state_buffer = []
        wait_delta_sec = 0.01

        print('Getting Pose')
        self.__car_client.simSetPose(Pose(Vector3r(starting_points[0], starting_points[1], starting_points[2]), AirSimClientBase.toQuaternion(starting_direction[0], starting_direction[1], starting_direction[2])), True)

        # Currently, simSetPose does not allow us to set the velocity. 
        # So, if we crash and call simSetPose, the car will be still moving at its previous velocity.
        # We need the car to stop moving, so push the brake and wait for a few seconds.
        print('Waiting for momentum to die')
        self.__car_controls.steering = 0
        self.__car_controls.throttle = 0
        self.__car_controls.brake = 1
        self.__car_client.setCarControls(self.__car_controls)
        time.sleep(4)
        
        print('Resetting')
        self.__car_client.simSetPose(Pose(Vector3r(starting_points[0], starting_points[1], starting_points[2]), AirSimClientBase.toQuaternion(starting_direction[0], starting_direction[1], starting_direction[2])), True)

        #Start the car rolling so it doesn't get stuck
        print('Running car for a few seconds...')
        self.__car_controls.steering = 0
        self.__car_controls.throttle = 1
        self.__car_controls.brake = 0
        self.__car_client.setCarControls(self.__car_controls)
        
        # While the car is rolling, start initializing the state buffer
        stop_run_time = datetime.datetime.now() + datetime.timedelta(seconds=2)
        while(datetime.datetime.now() < stop_run_time):
            time.sleep(wait_delta_sec)
            state_buffer = self.__append_to_ring_buffer(self.__get_image(), state_buffer, state_buffer_len)
        done = False
        actions = []  # records the state (action) we transition to
        pre_states = []
        post_states = []
        rewards = []
        predicted_rewards = []
        car_state = self.__car_client.getCarState()

        start_time = datetime.datetime.utcnow()
        end_time = start_time + datetime.timedelta(seconds=self.__max_epoch_runtime_sec)
        
        num_random = 0
        far_off = False
        
        # Main data collection loop
        while not done:
            collision_info = self.__car_client.getCollisionInfo()
            utc_now = datetime.datetime.utcnow()
            
            # Check for terminal conditions:
            # 1) Car has collided
            # 2) Car is stopped
            # 3) The run has been running for longer than max_epoch_runtime_sec. 
            #       This constraint is so the model doesn't end up having to churn through huge chunks of data, slowing down training
            # 4) The car has run off the road
            if (collision_info.has_collided or car_state.speed < 2 or utc_now > end_time or far_off):
                if (utc_now > end_time):
                    print('timed out.')
                done = True
            else:

                # The agent should occasionally pick a random action instead of the best action
                do_greedy = np.random.random_sample()
                pre_state = copy.deepcopy(state_buffer)
                if (do_greedy < self.__epsilon or always_random):
                    num_random += 1
                    next_state = self.__model.get_random_state()
                    predicted_reward = 0
                    
                else:
                    next_state, predicted_reward = self.__model.predict_state(pre_state)
                    print('Model predicts {0}'.format(next_state))

                # Convert the selected state to a control signal
                next_control_signals = self.__model.state_to_control_signals(next_state, self.__car_client.getCarState())

                # Take the action
                self.__car_controls.steering = next_control_signals[0]
                self.__car_controls.throttle = next_control_signals[1]
                self.__car_controls.brake = next_control_signals[2]
                self.__car_client.setCarControls(self.__car_controls)
                
                # Wait for a short period of time to see outcome
                time.sleep(wait_delta_sec)

                # Observe outcome and compute reward from action
                state_buffer = self.__append_to_ring_buffer(self.__get_image(), state_buffer, state_buffer_len)
                car_state = self.__car_client.getCarState()
                collision_info = self.__car_client.getCollisionInfo()
                reward, far_off = self.__compute_reward(collision_info, car_state)
                
                # Add the experience to the set of examples from this iteration
                pre_states.append(pre_state)
                post_states.append(state_buffer)
                rewards.append(reward)
                predicted_rewards.append(predicted_reward)
                actions.append(next_state)

        # Only the last state is a terminal state.
        is_not_terminal = [1 for i in range(0, len(actions)-1, 1)]
        is_not_terminal.append(0)
        
        # Add all of the states from this iteration to the replay memory
        self.__add_to_replay_memory('pre_states', pre_states)
        self.__add_to_replay_memory('post_states', post_states)
        self.__add_to_replay_memory('actions', actions)
        self.__add_to_replay_memory('rewards', rewards)
        self.__add_to_replay_memory('predicted_rewards', predicted_rewards)
        self.__add_to_replay_memory('is_not_terminal', is_not_terminal)

        print('Percent random actions: {0}'.format(num_random / max(1, len(actions))))
        print('Num total actions: {0}'.format(len(actions)))
        
        # If we are in the main loop, reduce the epsilon parameter so that the model will be called more often
        if not always_random:
            self.__epsilon -= self.__per_iter_epsilon_reduction
            self.__epsilon = max(self.__epsilon, self.__min_epsilon)
        
        return self.__experiences, len(actions)

    # Adds a set of examples to the replay memory
    def __add_to_replay_memory(self, field_name, data):
        if field_name not in self.__experiences:
            self.__experiences[field_name] = data
        else:
            self.__experiences[field_name] += data
            start_index = max(0, len(self.__experiences[field_name]) - self.__replay_memory_size)
            self.__experiences[field_name] = self.__experiences[field_name][start_index:]

    # Sample experiences from the replay memory
    def __sample_experiences(self, experiences, frame_count, sample_randomly):
        sampled_experiences = {}
        sampled_experiences['pre_states'] = []
        sampled_experiences['post_states'] = []
        sampled_experiences['actions'] = []
        sampled_experiences['rewards'] = []
        sampled_experiences['predicted_rewards'] = []
        sampled_experiences['is_not_terminal'] = []

        # Compute the surprise factor, which is the difference between the predicted and the actual Q value for each state.
        # We can use that to weight examples so that we are more likely to train on examples that the model got wrong.
        suprise_factor = np.abs(np.array(experiences['rewards'], dtype=np.dtype(float)) - np.array(experiences['predicted_rewards'], dtype=np.dtype(float)))
        suprise_factor_normalizer = np.sum(suprise_factor)
        suprise_factor /= float(suprise_factor_normalizer)
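        # suprise_factor now sums to 1.0, so it can be used as the sampling distribution p below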

        # Generate one minibatch for each frame of the run
        for _ in range(0, frame_count, 1):
            if sample_randomly:
                idx_set = set(np.random.choice(list(range(0, suprise_factor.shape[0], 1)), size=(self.__batch_size), replace=False))
            else:
                idx_set = set(np.random.choice(list(range(0, suprise_factor.shape[0], 1)), size=(self.__batch_size), replace=False, p=suprise_factor))
        
            sampled_experiences['pre_states'] += [experiences['pre_states'][i] for i in idx_set]
            sampled_experiences['post_states'] += [experiences['post_states'][i] for i in idx_set]
            sampled_experiences['actions'] += [experiences['actions'][i] for i in idx_set]
            sampled_experiences['rewards'] += [experiences['rewards'][i] for i in idx_set]
            sampled_experiences['predicted_rewards'] += [experiences['predicted_rewards'][i] for i in idx_set]
            sampled_experiences['is_not_terminal'] += [experiences['is_not_terminal'][i] for i in idx_set]
            
        return sampled_experiences
        
     
    # Train the model on minibatches and post to the trainer node.
    # The trainer node will respond with the latest version of the model that will be used in further data generation iterations.
    def __publish_batch_and_update_model(self, batches, batches_count):
        # Train and get the gradients
        print('Publishing epoch data and getting latest model from parameter server...')
        gradients = self.__model.get_gradient_update_from_batches(batches)
        
        # Post the data to the trainer node
        if not self.__local_run:
            post_data = {}
            post_data['gradients'] = gradients
            post_data['batch_count'] = batches_count
            
            new_model_parameters = requests.post('http://{0}:80/gradient_update'.format(self.__trainer_ip_address), json=post_data)
            print('New params:')
            print(new_model_parameters)
            
            # Update the existing model with the new parameters
            self.__model.from_packet(new_model_parameters.json())
        else:
            if (self.__num_batches_run > self.__batch_update_frequency + self.__last_checkpoint_batch_count):
                self.__model.update_critic()
                
                checkpoint = {}
                checkpoint['model'] = self.__model.to_packet(get_target=True)
                checkpoint['batch_count'] = batches_count
                checkpoint_str = json.dumps(checkpoint)

                checkpoint_dir = os.path.join(os.path.join(self.__data_dir, 'checkpoint'), self.__experiment_name)
                
                if not os.path.isdir(checkpoint_dir):
                    try:
                        os.makedirs(checkpoint_dir)
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise
                            
                file_name = os.path.join(checkpoint_dir,'{0}.json'.format(self.__num_batches_run)) 
                with open(file_name, 'w') as f:
                    print('Checkpointing to {0}'.format(file_name))
                    f.write(checkpoint_str)
                
                self.__last_checkpoint_batch_count = self.__num_batches_run
                
    # Gets the latest model from the trainer node
    def __get_latest_model(self):
        print('Getting latest model from parameter server...')
        response = requests.get('http://{0}:80/latest'.format(self.__trainer_ip_address)).json()
        self.__model.from_packet(response)

    # Gets an image from AirSim
    def __get_image(self):
        image_response = self.__car_client.simGetImages([ImageRequest(0, AirSimImageType.Scene, False, False)])[0]
        image1d = np.frombuffer(image_response.image_data_uint8, dtype=np.uint8)
        image_rgba = image1d.reshape(image_response.height, image_response.width, 4)

        return image_rgba[76:135,0:255,0:3].astype(float)

    # Computes the reward function based on the car position.
    def __compute_reward(self, collision_info, car_state):
        #Define some constant parameters for the reward function
        THRESH_DIST = 3.5                # The maximum distance from the center of the road to compute the reward function
        DISTANCE_DECAY_RATE = 1.2        # The rate at which the reward decays for the distance function
        CENTER_SPEED_MULTIPLIER = 2.0    # The ratio at which we prefer the distance reward to the speed reward

        # If the car has collided, the reward is always zero
        if (collision_info.has_collided):
            return 0.0, True
        
        # If the car is stopped, the reward is always zero
        speed = car_state.speed
        if (speed < 2):
            return 0.0, True
        
        #Get the car position
        position_key = bytes('position', encoding='utf8')
        x_val_key = bytes('x_val', encoding='utf8')
        y_val_key = bytes('y_val', encoding='utf8')

        car_point = np.array([car_state.kinematics_true[position_key][x_val_key], car_state.kinematics_true[position_key][y_val_key], 0])
        
        # Distance component is exponential distance to nearest line
        distance = 999
        
        #Compute the distance to the nearest center line
        for line in self.__reward_points:
            local_distance = 0
            length_squared = ((line[0][0]-line[1][0])**2) + ((line[0][1]-line[1][1])**2)
            if (length_squared != 0):
                t = max(0, min(1, np.dot(car_point-line[0], line[1]-line[0]) / length_squared))
                proj = line[0] + (t * (line[1]-line[0]))
                local_distance = np.linalg.norm(proj - car_point)
            
            distance = min(local_distance, distance)
            
        distance_reward = math.exp(-(distance * DISTANCE_DECAY_RATE))
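        # e.g. distance 0 -> reward 1.0; distance 1.0 -> exp(-1.2), roughly 0.30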
        
        return distance_reward, distance > THRESH_DIST

    # Initializes the points used for determining the starting point of the vehicle
    def __init_road_points(self):
        self.__road_points = []
        car_start_coords = [12961.722656, 6660.329102, 0]
        with open(os.path.join(os.path.join(self.__data_dir, 'data'), 'road_lines.txt'), 'r') as f:
            for line in f:
                points = line.split('\t')
                first_point = np.array([float(p) for p in points[0].split(',')] + [0])
                second_point = np.array([float(p) for p in points[1].split(',')] + [0])
                self.__road_points.append(tuple((first_point, second_point)))

        # Points in road_points.txt are in unreal coordinates
        # But car start coordinates are not the same as unreal coordinates
        for point_pair in self.__road_points:
            for point in point_pair:
                point[0] -= car_start_coords[0]
                point[1] -= car_start_coords[1]
                point[0] /= 100
                point[1] /= 100
              
    # Initializes the points used for determining the optimal position of the vehicle during the reward function
    def __init_reward_points(self):
        self.__reward_points = []
        with open(os.path.join(os.path.join(self.__data_dir, 'data'), 'reward_points.txt'), 'r') as f:
            for line in f:
                point_values = line.split('\t')
                first_point = np.array([float(point_values[0]), float(point_values[1]), 0])
                second_point = np.array([float(point_values[2]), float(point_values[3]), 0])
                self.__reward_points.append(tuple((first_point, second_point)))

    # Randomly selects a starting point on the road
    # Used for initializing an iteration of data generation from AirSim
    def __get_next_starting_point(self):
    
        # Get the current state of the vehicle
        car_state = self.__car_client.getCarState()

        # Pick a random road.
        random_line_index = np.random.randint(0, high=len(self.__road_points))
        
        # Pick a random position on the road. 
        # Do not start too close to either end, as the car may crash during the initial run.
        random_interp = (np.random.random_sample() * 0.4) + 0.3
        
        # Pick a random direction to face
        random_direction_interp = np.random.random_sample()

        # Compute the starting point of the car
        random_line = self.__road_points[random_line_index]
        random_start_point = list(random_line[0])
        random_start_point[0] += (random_line[1][0] - random_line[0][0])*random_interp
        random_start_point[1] += (random_line[1][1] - random_line[0][1])*random_interp

        # Compute the direction that the vehicle will face
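        # Road lines are assumed to be axis-aligned, so one of the two branches below always applies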
        # Vertical line
        if (np.isclose(random_line[0][1], random_line[1][1])):
            if (random_direction_interp > 0.5):
                random_direction = (0,0,0)
            else:
                random_direction = (0, 0, math.pi)
        # Horizontal line
        elif (np.isclose(random_line[0][0], random_line[1][0])):
            if (random_direction_interp > 0.5):
                random_direction = (0,0,math.pi/2)
            else:
                random_direction = (0,0,-1.0 * math.pi/2)

        # The z coordinate is always zero
        random_start_point[2] = -0
        return (random_start_point, random_direction)

    # A helper function to make a directory if it does not exist
    def __make_dir_if_not_exist(self, directory):
        if not (os.path.exists(directory)):
            try:
                os.makedirs(directory)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
Example no. 4
0
class DistributedAgent():
    def __init__(self):
        self.__model_buffer = None
        self.__model = None
        self.__airsim_started = False
        self.__data_dir = 'data/'
        self.__handle_dir = 'data/handle_image/'
        self.__per_iter_epsilon_reduction = 0.003
        self.__min_epsilon = 0.1
        self.__max_epoch_runtime_sec = float(30)
        self.__replay_memory_size = 50
        self.__batch_size = 32
        self.__experiment_name = 'local_run'
        self.__train_conv_layers = False
        self.__epsilon = 1
        self.__percent_full = 0
        self.__num_batches_run = 0
        self.__last_checkpoint_batch_count = 0
        self.__handles = {}
        self.__batch_update_frequency = 10
        self.__weights_path = None    
        self.__airsim_path = '../AD_Cookbook_AirSim/'
        self.__local_run = True
        self.__car_client = None
        self.__car_controls = None
        # self.__minibatch_dir = os.path.join(self.__data_dir, 'minibatches')
        # self.__output_model_dir = os.path.join(self.__data_dir, 'models')
        # self.__make_dir_if_not_exist(self.__minibatch_dir)
        # self.__make_dir_if_not_exist(self.__output_model_dir)
        self.__last_model_file = ''
        self.__possible_ip_addresses = []
        self.__trainer_ip_address = None
        self.__experiences = {}
        self.prev_steering = 0
        self.__init_road_points()
        self.__init_reward_points()
        self.__init_handle_images()
        self.__best_drive = datetime.timedelta(seconds=-1)  #added 2021-03-09 by kang
        self.__best_model = None  #added 2021-03-09 by kang
        self.__num_of_trial = 0

    def start(self):
        self.__run_function()

    def __run_function(self):
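        # Main loop for a local run: fill the replay memory by driving randomly, then repeatedly
        # generate experiences, sample minibatches, and update the model.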
        self.__model = RlModel(self.__weights_path, self.__train_conv_layers)
        self.__connect_to_airsim()
        print('Filling replay memory...')
        while True:
            print('Running Airsim Epoch.')
            try:
                if self.__percent_full < 100:
                    self.__run_airsim_epoch(True)
                    self.__percent_full = 100.0 * len(self.__experiences['actions'])/self.__replay_memory_size
                    print('Replay memory now contains {0} members. ({1}% full)'.format(len(self.__experiences['actions']), self.__percent_full))
                else:
                    if (self.__model is not None):
                        print('Running Airsim Epoch.')
                        experiences, frame_count, drive_time = self.__run_airsim_epoch(False)
                        # If we didn't immediately crash, train on the gathered experiences
                        if (frame_count > 0):
                            print('Generating {0} minibatches...'.format(frame_count))
                            print('Sampling Experiences.')
                            # Sample experiences from the replay memory
                            sampled_experiences = self.__sample_experiences(experiences, frame_count, True)

                            self.__num_batches_run += frame_count
                            
                            # If we successfully sampled, train on the collected minibatches and send the gradients to the trainer node
                            if (len(sampled_experiences) > 0):
                                print('Publishing AirSim Epoch.')
                                self.__publish_batch_and_update_model(sampled_experiences, frame_count, drive_time)
                
            except msgpackrpc.error.TimeoutError:
                print('Lost connection to AirSim while filling replay memory. Attempting to reconnect.')
                self.__connect_to_airsim()

    def __connect_to_airsim(self):
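        # Connect to the AirSim exe. Assume it is already running; after 10 consecutive failed
        # attempts, try to start AirSim from self.__airsim_path.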
        attempt_count = 0
        while True:
            try:
                print('Attempting to connect to AirSim (attempt {0})'.format(attempt_count))
                self.__car_client = CarClient()
                self.__car_client.confirmConnection()
                self.__car_client.enableApiControl(True)
                self.__car_controls = CarControls()
                print('Connected!')
                return
            except:
                print('Failed to connect.')
                attempt_count += 1
                if (attempt_count % 10 == 0):
                    print('10 consecutive failures to connect. Attempting to start AirSim on my own.')
                    os.system('START "" powershell.exe {0}'.format(os.path.join(self.__airsim_path, 'AD_Cookbook_Start_AirSim.ps1 neighborhood -windowed')))
                print('Waiting a few seconds.')
                time.sleep(10)

    def __run_airsim_epoch(self, always_random):
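        # Run one iteration of data generation from AirSim and store the results in the replay memory.
        # Returns (experiences, number of actions taken, drive time for this epoch).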
        starting_points, starting_direction = self.__get_next_starting_point()
        # state_buffer_len = 4  # removed by kang 2021-03-09 because it is no longer used
        state_buffer = []
        wait_delta_sec = 0.01
        self.__car_client.simSetPose(Pose(Vector3r(starting_points[0], starting_points[1], starting_points[2]), AirSimClientBase.toQuaternion(starting_direction[0], starting_direction[1], starting_direction[2])), True)
        self.__car_controls.steering = 0
        self.__car_controls.throttle = 1
        self.__car_controls.brake = 0
        self.prev_steering = 0
        self.__car_client.setCarControls(self.__car_controls)
        time.sleep(1.5)
        state_buffer = self.__get_image()

        done = False
        actions = []
        pre_states = []
        post_states = []
        rewards = []
        predicted_rewards = []
        car_state = self.__car_client.getCarState()

        start_time = datetime.datetime.utcnow()
        end_time = start_time + datetime.timedelta(seconds=self.__max_epoch_runtime_sec)
        
        num_random = 0
        far_off = False

        while not done:
            collision_info = self.__car_client.getCollisionInfo()
            utc_now = datetime.datetime.utcnow()
            
            if (collision_info.has_collided or car_state.speed < 1 or utc_now > end_time or far_off):
                print('Start time: {0}, end time: {1}'.format(start_time, utc_now), file=sys.stderr)
                self.__car_controls.steering = 0
                self.__car_controls.throttle = 0
                self.__car_controls.brake = 1
                self.__car_client.setCarControls(self.__car_controls)
                time.sleep(4)
                if (utc_now > end_time):
                    print('timed out.')
                    print('Full autonomous run finished at {0}'.format(utc_now), file=sys.stderr)
                done = True
                sys.stderr.flush()
            else:
                # The agent should occasionally pick a random action instead of the best action
                do_greedy = np.random.random_sample()
                pre_state = copy.deepcopy(state_buffer)
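                # Look up the pre-rendered 'handle' image (presumably a steering-wheel indicator) that matches
                # the current steering angle and concatenate it to the camera image as an extra channel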
                angle = -int(self.prev_steering/0.05*4)
                pre_handle = self.__handles[angle].reshape(59,255,1)
                pre_state = np.concatenate([pre_state, pre_handle], axis=2)
                if (do_greedy < self.__epsilon or always_random):
                    num_random += 1
                    next_state = self.__model.get_random_state()
                    predicted_reward = 0
                else:
                    next_state, predicted_reward = self.__model.predict_state(pre_state)
                    print('Model predicts {0}'.format(next_state))
                # Convert the selected state to a control signal
                next_control_signals = self.__model.state_to_control_signals(next_state, self.__car_client.getCarState())

                # Take the action
                self.__car_controls.steering = self.prev_steering + next_control_signals[0]
                if self.__car_controls.steering > 1.0:
                    self.__car_controls.steering = 1.0
                elif self.__car_controls.steering < -1.0:
                    self.__car_controls.steering = -1.0
                self.prev_steering = self.__car_controls.steering
                print('change steering : ', self.prev_steering)
                self.__car_controls.throttle = next_control_signals[1]
                self.__car_controls.brake = next_control_signals[2]
                self.__car_client.setCarControls(self.__car_controls)
                
                # Wait for a short period of time to see outcome
                time.sleep(wait_delta_sec)

                # Observe outcome and compute reward from action
                state_buffer = self.__get_image()
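                # Rebuild the handle channel for the updated steering angle to form the post-action state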
                angle = -int(self.prev_steering/0.05*4)
                post_handle = self.__handles[angle].reshape(59,255,1)
                post_state = np.concatenate([state_buffer, post_handle],axis=2)
                car_state = self.__car_client.getCarState()
                collision_info = self.__car_client.getCollisionInfo()
                reward, far_off = self.__compute_reward(collision_info, car_state)
                
                # Add the experience to the set of examples from this iteration
                pre_states.append(pre_state)
                post_states.append(post_state)
                rewards.append(reward)
                predicted_rewards.append(predicted_reward)
                actions.append(next_state)
        # If too few actions were recorded, skip training for this epoch.  # added 2021-03-09 by kang
        if len(actions) < 10:
            return self.__experiences, 0, 0

        is_not_terminal = [1 for i in range(0, len(actions)-1, 1)]
        is_not_terminal.append(0)
        self.__add_to_replay_memory('pre_states', pre_states)
        self.__add_to_replay_memory('post_states', post_states)
        self.__add_to_replay_memory('actions', actions)
        self.__add_to_replay_memory('rewards', rewards)
        self.__add_to_replay_memory('predicted_rewards', predicted_rewards)
        self.__add_to_replay_memory('is_not_terminal', is_not_terminal)

        print('Percent random actions: {0}'.format(num_random / max(1, len(actions))))
        print('Num total actions: {0}'.format(len(actions)))
        
        if not always_random:
            self.__epsilon -= self.__per_iter_epsilon_reduction
            self.__epsilon = max(self.__epsilon, self.__min_epsilon)
        
        return self.__experiences, len(actions), utc_now - start_time

    def __add_to_replay_memory(self, field_name, data):
        if field_name not in self.__experiences:
            self.__experiences[field_name] = data
        else:
            self.__experiences[field_name] += data
            start_index = max(0, len(self.__experiences[field_name]) - self.__replay_memory_size)
            self.__experiences[field_name] = self.__experiences[field_name][start_index:]

    def __sample_experiences(self, experiences, frame_count, sample_randomly):
        sampled_experiences = {}
        sampled_experiences['pre_states'] = []
        sampled_experiences['post_states'] = []
        sampled_experiences['actions'] = []
        sampled_experiences['rewards'] = []
        sampled_experiences['predicted_rewards'] = []
        sampled_experiences['is_not_terminal'] = []
        # Compute the surprise factor, which is the difference between the predicted and the actual Q value for each state.
        # We can use that to weight examples so that we are more likely to train on examples that the model got wrong.
        suprise_factor = np.abs(np.array(experiences['rewards'], dtype=np.dtype(float)) - np.array(experiences['predicted_rewards'], dtype=np.dtype(float)))
        suprise_factor_normalizer = np.sum(suprise_factor)
        suprise_factor /= float(suprise_factor_normalizer)

        # Generate one minibatch for each frame of the run
        for _ in range(0, frame_count, 1):
            if sample_randomly:
                idx_set = set(np.random.choice(list(range(0, suprise_factor.shape[0], 1)), size=(self.__batch_size), replace=False))
            else:
                idx_set = set(np.random.choice(list(range(0, suprise_factor.shape[0], 1)), size=(self.__batch_size), replace=False, p=suprise_factor))
        
            sampled_experiences['pre_states'] += [experiences['pre_states'][i] for i in idx_set]
            sampled_experiences['post_states'] += [experiences['post_states'][i] for i in idx_set]
            sampled_experiences['actions'] += [experiences['actions'][i] for i in idx_set]
            sampled_experiences['rewards'] += [experiences['rewards'][i] for i in idx_set]
            sampled_experiences['predicted_rewards'] += [experiences['predicted_rewards'][i] for i in idx_set]
            sampled_experiences['is_not_terminal'] += [experiences['is_not_terminal'][i] for i in idx_set]
            
        return sampled_experiences
        
    def __publish_batch_and_update_model(self, batches, batches_count, drive_time): # added 2021-03-09 by kang
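        # Train on the sampled minibatches, checkpoint the model periodically, and keep a separate
        # copy of the best policy (measured by drive time).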
        # Train and get the gradients
        print('Publishing epoch data and getting latest model from parameter server...')
        gradients = self.__model.get_gradient_update_from_batches(batches) 
    
        if (self.__num_batches_run > self.__batch_update_frequency + self.__last_checkpoint_batch_count):
            self.__model.update_critic()
            
            checkpoint = {}
            checkpoint['model'] = self.__model.to_packet(get_target=True)
            checkpoint['batch_count'] = batches_count
            checkpoint_str = json.dumps(checkpoint)

            checkpoint_dir = os.path.join(os.path.join(self.__data_dir, 'checkpoint'), self.__experiment_name)
            
            if not os.path.isdir(checkpoint_dir):
                try:
                    os.makedirs(checkpoint_dir)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                        
            file_name = os.path.join(checkpoint_dir,'{0}.json'.format(self.__num_batches_run)) 
            with open(file_name, 'w') as f:
                print('Checkpointing to {0}'.format(file_name))
                f.write(checkpoint_str)
            
            self.__last_checkpoint_batch_count = self.__num_batches_run
            
            # Treat the run with the longest drive time as the best policy and save it separately.  # added 2021-03-09 by kang
            if drive_time > self.__best_drive:
                print("="*30)
                print("New Best Policy!!!!!!")
                print("="*30)
                self.__best_drive = drive_time
                bestpoint_dir = os.path.join(os.path.join(self.__data_dir, 'bestpoint'), self.__experiment_name)
                if not os.path.isdir(bestpoint_dir):
                    try:
                        os.makedirs(bestpoint_dir)
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise
                file_name = os.path.join(bestpoint_dir,'{0}.json'.format(self.__num_batches_run)) 

                with open(file_name, 'w') as f:
                    print('Add Best Policy to {0}'.format(file_name))
                    f.write(checkpoint_str)
                self.__best_model = self.__model
                self.__best_experiences = self.__experiences

            elif self.__num_of_trial > 10:
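                # No new best policy for more than 10 epochs: roll back to the best model and its experiences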
                print("="*30)
                print("Reload best Model")
                print("="*30)
                self.__model = self.__best_model
                self.__experiences = self.__best_experiences
                self.__num_of_trial = 0
            self.__num_of_trial += 1

    def __compute_reward(self, collision_info, car_state):
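        # Compute the reward from the collision info and the car's distance to the nearest road center line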
        #Define some constant parameters for the reward function
        THRESH_DIST = 3.5                # The maximum distance from the center of the road to compute the reward function
        DISTANCE_DECAY_RATE = 1.2        # The rate at which the reward decays for the distance function
        CENTER_SPEED_MULTIPLIER = 2.0    # The ratio at which we prefer the distance reward to the speed reward

        # If the car has collided, the reward is always zero
        # Experimented with giving a negative reward on collision.
        if (collision_info.has_collided):