Example #1
class DDPG(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 gamma=0.99,
                 tau=0.01,
                 actor_lr=0.0005,
                 critic_lr=0.001,
                 noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=10,
                 logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=gamma,
                           lr=actor_lr,
                           tau=tau,
                           l2_reg=0)
        self.critic = Critic(self.n_state,
                             self.n_action,
                             gamma=gamma,
                             lr=critic_lr,
                             tau=tau,
                             l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen

        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(
            rewards)  # np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target
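        # A vectorized equivalent of the loop above (a sketch, assuming the
        # three arguments are NumPy arrays or sequences of equal length):
        #   q_target = rewards + self.critic.gamma * q_nexts * (1 - np.asarray(dones, dtype=float))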

    def update_model(self, states, actions, q_values):
        # train critic
        loss_names, loss_values = self.critic.train_on_batch(
            states, actions, q_values)

        # train actor
        # p_actions = self.actor.predict(states)  #actions with no noise
        grad_ys = self.critic.gradients(
            states, self.actor.predict(states))  #(batch, n-action)
        actor_output = self.actor.train(states, self.actor.predict(states),
                                        grad_ys)
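        # Deterministic policy gradient step: grad_ys is dQ/da evaluated at
        # a = actor(s), and actor.train is expected to back-propagate it so the
        # applied update follows dQ/dtheta = dQ/da * da/dtheta (chain rule).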

        # copy network
        self.actor.copy_weights()
        self.critic.copy_weights()

        # print(grad_ys, grad_ys.shape)
        # print(actor_output[0],actor_output[0].shape)
        # print(np.mean(grad_ys*actor_output[0]))

        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(
            os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(
                path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # none will be array(None)
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.policy_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            # a = np.clip(np.random.normal(a, self.noise_std), self.a_bound[0], self.a_bound[1])
            # a = np.clip(a + noise.generate(time, a.shape[0]), self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  #(batch,)
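            # Reward shaping used in this example: treat the (noisy) action a as
            # a probability, take its clipped log-odds, and sign it by the label,
            # so confident correct predictions receive the largest rewards.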
            # q_nexts = self.critic.target_predict(new_states, self.actor.target_predict(new_states))
            q_ = self.bellman_q_value(rewards=r,
                                      q_nexts=0,
                                      dones=[True] * r.shape[0])  #(batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(
                states, a, q_.reshape(-1, 1))

            score = r.mean()

            if ((e + 1) % self.noise_decay_steps - 1) == 0:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

                summary_writer.add_summary(tf_summary(['mean-reward'],
                                                      [score]),
                                           global_step=e)
                summary_writer.add_summary(tf_summary(loss_names,
                                                      [loss_values]),
                                           global_step=e)
                merge = keras.backend.get_session().run(
                    self.merge,
                    feed_dict={
                        self.critic.model.input[0]: states,
                        self.critic.model.input[1]: a,
                        self.actor.model.input: states
                    })
                summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            # print(grad_ys,grad_ys.shape)
            # print(actor_output)
            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
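# A minimal usage sketch for the class above (Actor, Critic, CsvBuffer, Dataset,
# the logger and the checkpoint queue come from the surrounding project; the
# concrete values below are hypothetical):
#   agent = DDPG(n_state=20, n_action=1, a_bound=(0.01, 0.99),
#                logger=my_logger, checkpoint_queen=my_ckpt_queen)
#   agent.train(args, summary_writer, train_data=train_array, val_data=val_array)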
Example #2
def run_carla_client(args):
    # Here we will run 3 episodes with 300 frames each.
    number_of_episodes = 60000
    frames_per_episode = 400

    # We assume the CARLA server is already waiting for a client to connect at
    # host:port. To create a connection we can use the `make_carla_client`
    # context manager, it creates a CARLA client object and starts the
    # connection. It will throw an exception if something goes wrong. The
    # context manager makes sure the connection is always cleaned up on exit.
    with make_carla_client(args.host, args.port, 30) as client:
        print('CarlaClient connected')

        # =============================================================================
        #       Global initialisations
        # =============================================================================
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        state_size = {
            'state_2D': (
                64,
                64,
                9,
            ),
            'state_1D': (17, )
        }
        action_size = (5, )

        critic = Critic(sess, state_size, action_size, CRITIC_LR)
        critic.target_train()
        actor = Actor(sess, state_size, action_size, ACTOR_LR)
        actor.target_train()
        memory = ExperienceMemory(100000, False)

        target_update_counter = 0
        target_update_freq = TARGET_UPDATE_BASE_FREQ

        explore_rate = 0.2

        success_counter = 0

        total_t = 0
        t = 0
        #NOTE This is just a trial target; it still needs to be reworked
        target = {
            'pos': np.array([-3.7, 236.4, 0.9]),
            'ori': np.array([0.00, -1.00, 0.00])
        }

        if args.settings_filepath is None:
            # Create a CarlaSettings object. This object is a wrapper around
            # the CarlaSettings.ini file. Here we set the configuration we
            # want for the new episode.
            settings = CarlaSettings()
            settings.set(SynchronousMode=True,
                         SendNonPlayerAgentsInfo=True,
                         NumberOfVehicles=0,
                         NumberOfPedestrians=0,
                         WeatherId=random.choice([1]),
                         QualityLevel=args.quality_level)
            #            settings.randomize_seeds()
            #
            #            settings.randomize_seeds()
            # The default camera captures RGB images of the scene.
            camera0 = Camera('CameraRGB')
            # Set image resolution in pixels.
            camera0.set_image_size(64, 64)
            # Set its position relative to the car in centimeters.
            camera0.set_position(0.30, 0, 1.30)
            settings.add_sensor(camera0)
        else:

            # Alternatively, we can load these settings from a file.
            with open(args.settings_filepath, 'r') as fp:
                settings = fp.read()
        scene = client.load_settings(settings)

        # =============================================================================
        #       EPISODES LOOP
        # =============================================================================
        for episode in range(0, number_of_episodes):
            # Start a new episode.
            # Choose one player start at random.
            number_of_player_starts = len(scene.player_start_spots)
            player_start = random.randint(0, max(0,
                                                 number_of_player_starts - 1))
            player_start = 0
            total_reward = 0.
            # Notify the server that we want to start the episode at the
            # player_start index. This function blocks until the server is ready
            # to start the episode.
            print('Starting new episode...')
            client.start_episode(player_start)

            #TODO Our learning algorithm should be implemented inside this block

            # =============================================================================
            #           Episodic initialisations
            # =============================================================================
            collisions = {'car': 0, 'ped': 0, 'other': 0}
            reverse = -1.0
            measurements, sensor_data = client.read_data()
            state = get_state_from_data(measurements, sensor_data, reverse)
            goal = get_goal_from_data(target)
            t = 0
            stand_still_counter = 0
            # =============================================================================
            #           STEPS LOOP
            # =============================================================================
            for frame in range(0, frames_per_episode):
                t = t + 1
                total_t += 1
                target_update_counter += 1
                explore_dev = 0.6 / (1 + total_t / 30000)
                explore_rate = 0.3 / (1 + total_t / 30000)
                # Print some of the measurements.
                #   print_measurements(measurements)

                # Save the images to disk if requested.
                if args.save_images_to_disk and False:
                    for name, measurement in sensor_data.items():
                        filename = args.out_filename_format.format(
                            episode, name, frame)
                        measurement.save_to_disk(filename)

                if state['state_1D'][9] < 5 and t > 50:
                    stand_still_counter += 1
                else:
                    stand_still_counter = 0
                #Calculate the action
                a_pred = actor.model.predict([
                    np.expand_dims(state['state_2D'], 0),
                    np.expand_dims(np.concatenate((state['state_1D'], goal)),
                                   0)
                ])[0]
                #Add exploration noise to action
                a = add_noise(a_pred, explore_dev, explore_rate)
                control = get_control_from_a(a)
                #Send control to the server
                client.send_control(control)

                #
                # =============================================================================
                #               TRAINING THE NETWORKS
                # =============================================================================
                if memory.num_items > 6000:
                    batch, indeces = memory.sample_experience(MINI_BATCH_SIZE)
                    raw_states = [[e[0]['state_2D'], e[0]['state_1D']]
                                  for e in batch]
                    goals = np.asarray([e[5] for e in batch])
                    states = {
                        'state_2D':
                        np.atleast_2d(np.asarray([e[0]
                                                  for e in raw_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_states[:])
                            ]))
                    }

                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([np.sum(e[2])
                                          for e in batch]).reshape(-1, 1)

                    raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']]
                                      for e in batch]
                    new_states = {
                        'state_2D':
                        np.atleast_2d(
                            np.asarray([e[0] for e in raw_new_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_new_states[:])
                            ]))
                    }

                    overs = np.asarray([e[4] for e in batch]).reshape(-1, 1)

                    best_a_preds = actor.target_model.predict(
                        [new_states['state_2D'], new_states['state_1D']])
                    max_qs = critic.target_model.predict([
                        new_states['state_2D'], new_states['state_1D'],
                        best_a_preds
                    ])

                    ys = rewards + (1 - overs) * GAMMA * max_qs
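                    # Vectorized Bellman targets: y = r + gamma * Q_target(s', mu_target(s')),
                    # with (1 - overs) masking out the bootstrap term for terminal transitions.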
                    #Train Critic network
                    critic.model.train_on_batch(
                        [states['state_2D'], states['state_1D'], actions], ys)
                    #Train Actor network
                    a_for_grads = actor.model.predict(
                        [states['state_2D'], states['state_1D']])
                    a_grads = critic.gradients(states, a_for_grads)
                    actor.train(states, a_grads)

                    #Train target networks
                    if target_update_counter >= int(target_update_freq):
                        target_update_counter = 0
                        target_update_freq = target_update_freq * TARGET_UPDATE_MULTIPLIER
                        critic.target_train()
                        actor.target_train()
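                    # Target-network sync: no tau is passed to these Actor/Critic
                    # constructors, so target_train() presumably copies the online
                    # weights outright, and the sync interval is rescaled by
                    # TARGET_UPDATE_MULTIPLIER after every sync.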
                # =============================================================================
                #               GET AND STORE OBSERVATIONS
                # =============================================================================
                #Get next measurements
                measurements, sensor_data = client.read_data()
                new_state = get_state_from_data(measurements, sensor_data,
                                                reverse, state)

                #TODO Calculate reward
                r_goal, success = calculate_goal_reward(
                    np.atleast_2d(new_state['state_1D']), goal)
                r_general, collisions = calculate_general_reward(
                    measurements, collisions)
                over = stand_still_counter > 30 or success
                success_counter += int(bool(success) * 1)
                total_reward += r_goal
                total_reward += r_general
                #Store observation
                if t > 10:
                    experience = pd.DataFrame(
                        [[
                            state, a,
                            np.array([r_goal, r_general]), new_state,
                            bool(over), goal, episode, 0
                        ]],
                        columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                        copy=True)
                    memory.add_experience(experience)

                #Set the state to the next state
                state = new_state
                if over:
                    break
            sub_goal = deepcopy(state['state_1D'][0:6])
            print(str(episode) + ". Episode###################")
            print("Total reward: " + str(total_reward))
            print("Success counter: " + str(success_counter))
            if (episode % 10 == 0):
                print("############## DEBUG LOG ################")
                print("Memory state: " + str(memory.num_items))
                print("Target update counter: " + str(target_update_counter))
                print("Exploration rate: " + str(explore_rate))
                print("Exploration dev: " + str(explore_dev))
                print("Total timesteps: " + str(total_t))
                print("Average episode length: " + str(total_t /
                                                       (episode + 1)))
                print("#########################################")


            # =============================================================================
            #           REPLAY FOR SUBGOALS
            # =============================================================================
            batch = memory.get_last_episode(t)
            raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']]
                              for e in batch]
            new_states = {
                'state_2D':
                np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                'state_1D':
                np.atleast_2d(np.asarray([e[1] for e in raw_new_states[:]]))
            }
            rewards = np.asarray([e[2] for e in batch]).reshape(-1, 2)
            r_subgoal = calculate_goal_reward(new_states['state_1D'],
                                              sub_goal)[0]
            rewards[:, 0] = r_subgoal
            subgoal_batch = [[
                v[0], v[1],
                list(rewards)[i], v[3], v[4], sub_goal, v[6], v[7]
            ] for i, v in enumerate(batch)]
            experiences = pd.DataFrame(
                subgoal_batch,
                columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                copy=True)
            memory.add_experience(experiences)
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, act_dim, env_dim, act_range, buffer_size = 20000, gamma=0.99, lr=0.00005, tau=0.001):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.demo_actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = Replay()
        self.batch_size = 2000

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)

        # if dones:
        #     critic_target[0] = rewards
        # else:
        #     critic_target[0] = rewards + self.gamma * q_values

        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.append((state, action, reward, done, new_state))

    def sample_batch(self):
        return self.buffer.sample_batch(self.batch_size)

    def update_models(self, states, actions, critic_target, actor_res, demo_actor_res):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        # print('critic_target', critic_target)
        self.critic.train_on_batch(states, actions, critic_target, actor_res, demo_actor_res)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        demo_actions = self.demo_actor.model.predict(states)
        demo_grads = self.critic.gradients(states, demo_actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim)))
        self.demo_actor.train(states, demo_actions, np.array(demo_grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.demo_actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(50000), desc='Score', leave=True, unit=" episodes")
        success = []
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)
            blockPos, blockOrn = p.getBasePositionAndOrientation(env.blockUid)
            experience = []

            while not done:
                # Actor picks an action (following the deterministic policy)
                old_state = get_state(env)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment

                gripperState  = p.getLinkState(env._kuka.kukaUid, env._kuka.kukaGripperIndex)
                gripperPos = gripperState[0]
                gripperOrn = gripperState[1]

                a = np.clip(a+noise.generate(time), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                new_state = get_state(env)


                gripperState  = p.getLinkState(env._kuka.kukaUid, env._kuka.kukaGripperIndex)
                next_gripperPos = gripperState[0]
                next_gripperOrn = gripperState[1]

                # Add outputs to memory buffer
                experience.append((old_state, a, r, done, new_state, gripperPos, gripperOrn, next_gripperPos, next_gripperOrn, blockPos, blockOrn))
                # self.memorize(old_state, a, r, done, new_state)

                # HER replay, sample a new goal
                blockPos, blockOrn = gripperPos, gripperOrn
                step_size = len(experience)
                her_experience = []
                for t in range(step_size):
                    old_state, action, reward, done, next_state, gripperPos, gripperOrn, next_gripperPos, next_gripperOrn, _, _ = np.copy(experience[t])
                    blockInGripperPosXYEulZ = env.get_block_in_gripper_pos(gripperPos, gripperOrn, blockPos, blockOrn)
                    old_state[6:9] = blockInGripperPosXYEulZ
                    next_blockInGripperPosXYEulZ = env.get_block_in_gripper_pos(next_gripperPos, next_gripperOrn, blockPos, blockOrn)
                    next_state[6:9] = next_blockInGripperPosXYEulZ
                    if t == step_size - 1:
                        reward = 0.5
                    her_experience.append((old_state, action, reward, done, next_state, gripperPos, gripperOrn, next_gripperPos, next_gripperOrn, blockPos, blockOrn))
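                # Hindsight experience replay (HER): the block pose is replaced
                # by the gripper pose actually reached (blockPos/blockOrn are
                # overwritten above), the stored states are relabelled
                # accordingly, and the last relabelled transition is given a
                # positive reward, so even unsuccessful episodes yield useful
                # training signal.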

                self.train_batch()

                # Update current state
                old_state = new_state

                if r > 0:
                    print('r', r)
                    success.append(e)
                    print(success)
                cumul_reward += r
                time += 1
            self.buffer.memo.extend(experience)
            self.buffer.demo_memo.extend(her_experience)

            # Gather stats every episode for plotting
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def train_batch(self):
        if len(self.buffer.memo) > self.batch_size and len(self.buffer.demo_memo) > self.batch_size:
            # Sample experience from buffer
            sample_batch, sample_demo_batch = self.sample_batch()
            states = []
            actions = []
            rewards = []
            dones = []
            new_states = []
            samples_size = len(sample_batch)

            for state, action, reward, done, new_state, _, _, _, _, _, _ in sample_batch:
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                dones.append(done)
                new_states.append(new_state)

            new_states = np.reshape(np.array(new_states), (samples_size, -1))
            actor_res = self.actor.target_predict(new_states)
            demo_actor_res = self.demo_actor.target_predict(new_states)
            q_values = self.critic.target_predict([new_states, actor_res])[0]
            q_values = np.reshape(q_values, (samples_size, ))
            critic_targets = self.bellman(rewards, q_values, dones)
            states = np.array(states)
            actions = np.array(actions)
            self.update_models(states, actions, critic_targets, actor_res, demo_actor_res)
Example #4
class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 action_dim,
                 state_dim,
                 batch_size,
                 step,
                 buffer_size,
                 train_indicator,
                 episode,
                 gamma,
                 lra,
                 lrc,
                 tau,
                 load_weight=True):
        """ Initialization
        """
        # Environment and A2C parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator
        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)
        # !: the weights folder needs to be specified; ensure only one set of actor & critic weights is in this folder
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"

        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)

                for file_path in weights_file_path:
                    if file_path.find("actor") < 0:
                        weights_critic_path = file_path
                    if file_path.find("critic") < 0:
                        weights_actor_path = file_path

                self.load_weights(weights_actor_path, weights_critic_path)

                print("")
                print("Actor-Critic Models are loaded with weights...")
                print("")
            except Exception:
                print("")
                print(
                    "Weights failed to load, please check the weights loading path..."
                )
                print("")

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target (one action only)
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate Tau
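        # (Polyak averaging, presumably: theta_target <- tau * theta + (1 - tau) * theta_target)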
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode
            # set initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(
                1, 180 * 5, [45, 55, 60, 65, 70, 75, 80
                             ])  #TODO: make sure states are received correctly
            actions, states, rewards = [], [], []

            print("Episode: ", e, " ========================:")

            for t in range(self.step):
                action_original = self.policy_action(state_old)

                #TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original,
                                                 size=self.action_dim)

                # action = action_orig + noise
                action = noise.apply_ou(t)

                # adjust too-low or too-high action
                adj_action = np.zeros(len(action))
                for index, value in enumerate(action):
                    adj_action[index] = clip(value, -1, 1)

                #action_mapping function
                transformed_action = Transformation.convert_actions(adj_action)

                reward, state_new = env.get_vissim_reward(
                    180 * 5, transformed_action)

                # TODO: if we know what the optimal discharging rate, then we set that as done
                if t == self.step - 1:  #we consider the manually set last step as done
                    done = True

                # ======================================================================================= Training section
                if (self.train_indicator):
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done,
                                  state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(
                        self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict(
                        [states_new,
                         self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on sampled batch, update target networks
                    self.update_models(states_old, actions, critic_target)
                    # calculate loss
                    loss = self.critic.train_on_batch(states_old, actions,
                                                      critic_target)
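                    # NOTE: update_models() above already fitted the critic on
                    # this batch; the extra train_on_batch call is how this
                    # example obtains a loss value to report, at the cost of a
                    # second gradient step on the same data.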
                    state_old = state_new
                    cumul_reward += reward
                    cumul_loss += loss
                # =======================================================================================

                # ======================================================================================= report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)
                # =======================================================================================

            # ======================================================================================= save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """
            # ======================================================================================= save model

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " +
                  str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")

            # garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(
            t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
Example #5
class TD3(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 discount=0.99,
                 tau=0.05,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 policy_freq=2,
                 exp_noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 smooth_noise_std=0.1,
                 clip=0.2,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=20,
                 logger=None,
                 checkpoint_queen=None):
        #self.__dict__.update(locals())
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount

        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=discount,
                           lr=actor_lr,
                           tau=tau)
        self.critic1 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.critic2 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(
            rewards)  # np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        if (self.buffer.with_per):
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
            # print(td_error)
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(
            states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
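        # Both critics are fitted to the same q_values target here; full TD3
        # would also build that target from min(Q1_target, Q2_target) at the
        # next state (see the commented-out block in train() below).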
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(
            os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(
            os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(
                path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(
                path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # none will be array(None)
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit="epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        else:
            dataset = Dataset(train_data, 1, shuffle=True)
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)

        warm_up = 20 * args.batch_size
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.select_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            # rewards = np.where(labels==1, llr.ravel(), -llr.ravel())  #(batch,)
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr.ravel(), 2 * llr.ravel()),
                               np.where(llr < 0, -llr.ravel(),
                                        -2 * llr.ravel()))  #(batch,)
            # print(rewards)

            # a_ = self.actor.target_predict(next_states)
            # noise = np.clip(np.random.normal(0, self.smooth_noise_std), 0, self.clip)
            # a_ = a_ + noise
            # q_next1 = self.critic1.target_predict(new_states, a_)
            # q_next2 = self.critic2.target_predict(new_states,a_)
            # q_nexts = np.where(q_next1<q_next2, q_next1, q_next2)
            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue

            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            # print(states.shape, a.shape, rewards.shape)

            q_ = self.bellman_q_value(rewards=rewards,
                                      q_nexts=0,
                                      dones=[True] *
                                      rewards.shape[0])  #(batch,)

            loss_names, loss_values = self.update_critic(
                states, a, q_.reshape(-1, 1))
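            # Delayed policy update (one of the TD3 modifications): the block
            # below refreshes the actor and the target copies only every
            # policy_freq critic updates.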

            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)

            if ((e + 1) % self.noise_decay_steps - 1) == 0 or e == warm_up:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1 or e == warm_up:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    # print(val_pred.shape,val_pred[:10])
                    # print(val_y.shape, val_y[:10])
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]),
                                       global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]),
                                       global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)

            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward',
                                    '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4}'.format(self.noise_std),
                               max_val_rate='{:.4}'.format(max_val_rate),
                               val_rate='{:.4}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  # number of sensor inputs

    # np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # G = tf.Graph()
    # sess = tf.Session(config=config)
    # tf.reset_default_graph()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = Actor(sess, BATCH_SIZE, TAU, LRA)
    critic = Critic(sess, BATCH_SIZE, TAU, LRC)

    sess.run(tf.global_variables_initializer())
    # actor = Actor( BATCH_SIZE, TAU, LRA)
    # critic = Critic( BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            # print("hey",s_t.shape)
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)
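            # The three OU.function calls above add Ornstein-Uhlenbeck
            # exploration noise with per-channel parameters (steering,
            # acceleration, brake), scaled by the linearly decaying epsilon and
            # switched off when train_indicator is 0.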

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            # print("hey",new_states.shape)
            target_q_values = critic.target_predict(
                [new_states, actor.target_predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                # print("main",states.dtype)
                # print("main",actions.dtype)
                # print("main",y_t.dtype)
                loss += critic.train([states, actions], y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # print("grads : ",grads.dtype)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break
        if np.mod(step, 1000) == 0:
            print("target_q_values : ", target_q_values)
        if np.mod(i, 3) == 0:
            if (train_indicator):
                pass
                # saver = tf.train.Saver()
                # saver.save(sess, save_path = 'weights/model.ckpt',global_step=1000)
        #         print("Now we save model")
        #         actor.model.save_weights("actormodel.h5", overwrite=True)
        #         with open("actormodel.json", "w") as outfile:
        #             json.dump(actor.model.to_json(), outfile)

        #         critic.model.save_weights("criticmodel.h5", overwrite=True)
        #         with open("criticmodel.json", "w") as outfile:
        #             json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #7
class DDPG():
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 act_dim,
                 env_dim,
                 act_range,
                 k,
                 buffer_size=10000,
                 gamma=0.99,
                 lr=0.001,
                 tau=0.001):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = (1, ) + (13, )
        self.gamma = gamma
        # Create actor and critic networks
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = deque(maxlen=buffer_size)
        self.count = 0
        self.buffer_size = buffer_size

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        experience = (state, action, reward, done, new_state)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            # deque(maxlen=buffer_size) would evict the oldest entry on its own;
            # the explicit popleft just makes the FIFO behaviour obvious.
            self.buffer.popleft()
            self.buffer.append(experience)

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch, a_batch, r_batch, d_batch, s2_batch = [], [], [], [], []
        for s_, a_, r_, d_, s2_ in batch:
            s_batch.append(s_)
            s2_batch.append(s2_)
            a_batch.append(a_)
            r_batch.append(r_)
            d_batch.append(d_)
        s_batch = np.squeeze(np.array(s_batch), axis=1)
        s2_batch = np.squeeze(np.array(s2_batch), axis=1)
        r_batch = np.reshape(np.array(r_batch), (len(r_batch), 1))
        a_batch = np.array(a_batch)

        d_batch = np.reshape(np.array(d_batch, dtype=int), (len(batch), 1))
        return s_batch, a_batch, r_batch, d_batch, s2_batch

        # # Return a batch of experience
        # s_batch = np.array([i[0] for i in batch])
        # a_batch = np.array([i[1] for i in batch])
        # r_batch = np.array([i[2] for i in batch])
        # d_batch = np.array([i[3] for i in batch])
        # new_s_batch = np.array([i[4] for i in batch])
        # return s_batch, a_batch, r_batch, d_batch, new_s_batch

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model(states)
        grads = self.critic.gradients(states, tfe.Variable(actions))
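        # dQ/da from the critic, evaluated at the current actor's actions rather
        # than the actions stored in the buffer (deterministic policy gradient).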
        # Train actor
        self.actor.train(states, grads)
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def format_state(self, state):
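        # Network input: the 10-dim observation concatenated with the 3-dim
        # desired goal (13 values, matching self.env_dim above).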
        ob_1 = np.reshape(state['observation'], (1, 10))
        de_1 = np.reshape(state['desired_goal'], (1, 3))
        return np.concatenate([ob_1, de_1], axis=1)

    def store_states(self, state, action, reward, done, info, new_state):
        # print(state['observation'].shape)
        ob_1 = np.reshape(state['observation'], (1, 10))
        ac_1 = np.reshape(state['achieved_goal'], (1, 3))
        de_1 = np.reshape(state['desired_goal'], (1, 3))
        ob_2 = np.reshape(new_state['observation'], (1, 10))
        s_1 = np.concatenate([ob_1, ac_1], axis=1)
        s2_1 = np.concatenate([ob_2, ac_1], axis=1)
        s_2 = np.concatenate([ob_1, de_1], axis=1)
        s2_2 = np.concatenate([ob_2, de_1], axis=1)
        substitute_goal = state['achieved_goal'].copy()
        substitute_reward = env.compute_reward(state['achieved_goal'],
                                               substitute_goal, info)
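        # Hindsight-style relabelling: substitute_goal is the goal actually reached,
        # so substitute_reward is the reward for a "successful" transition (note that
        # `env` here is a name resolved outside this class, e.g. a module-level
        # environment). e1 keeps the original transition toward the desired goal;
        # e2 re-stores the same transition with the achieved goal and done=True.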

        e1 = (s_2, action, reward, done, s2_2)
        e2 = (s_1, action, substitute_reward, True, s2_1)
        if self.count + 2 < self.buffer_size:
            self.count += 2
        else:
            self.buffer.popleft()
            self.buffer.popleft()
        self.buffer.append(e1)
        self.buffer.append(e2)

    def train(self, env, args):
        results = []
        num_steps = 200
        # First, gather experience
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit="episode")

        avg_r_ep = 0

        best_avg = -float('inf')
        best_score = -float('inf')

        past_samples = 15
        hist_ratio = deque(maxlen=past_samples)
        hist_scores = deque(maxlen=past_samples)
        for e in tqdm_e:
            noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.act_dim))
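            # A fresh Ornstein-Uhlenbeck process is created every episode, so
            # exploration noise does not carry over between episodes.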
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            s = env.reset()
            # noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            for _ in range(num_steps):
                if args.render: env.render()
                # Actor picks an action (following the deterministic policy)
                old_state = self.format_state(s)
                # print(old_state.shape)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise(), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                a = np.squeeze(a)
                new_state, r, done, info = env.step(a)
                dist = goal_distance(new_state['achieved_goal'],
                                     new_state['desired_goal'])
                # new_state = new_state['observation']

                # Add outputs to memory buffer
                self.store_states(s, a, r, done, info, new_state)

                s = new_state
                cumul_reward += r

                # Sample experience from buffer
                states, actions, rewards, dones, new_states = self.sample_batch(
                    args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states,
                     self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)

                if done:
                    break

            if avg_r_ep == 0:
                avg_r_ep = cumul_reward
            else:
                avg_r_ep = avg_r_ep * 0.99 + cumul_reward * 0.01
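            # avg_r_ep is an exponential moving average of episode returns
            # (decay 0.99); it drives the best-average checkpoint below.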

            if avg_r_ep >= best_avg:
                best_avg = avg_r_ep
                self.actor.model.save_weights(
                    'pretrained/best_avg_ddpgActor.h5')
                self.critic.model.save_weights(
                    'pretrained/best_avg_ddpgCritic.h5')
            # Display score
            if cumul_reward >= best_score:
                best_score = cumul_reward
                self.actor.model.save_weights('pretrained/ddpgActor.h5')
                self.critic.model.save_weights('pretrained/ddpgCritic.h5')

            hist_ratio.append(int(dist <= 0.05))
            hist_scores.append(cumul_reward)

            tqdm_e.set_description(
                "Score: {} | "
                "Best Reward: {} (avg: {:.2f})| "
                "Avg Reward, solve ratio over last {} samples: {:.3f}, {:.3f}".
                format(cumul_reward, np.amax(hist_scores),
                       avg_r_ep, past_samples, np.mean(hist_scores),
                       np.mean(hist_ratio)))
            tqdm_e.refresh()

        return results

    def eval(self, env, model_name='', random=False, render=False):
        if not random:
            self.actor.model.load_weights('pretrained/' + model_name +
                                          'Actor.h5')
            self.critic.model.load_weights('pretrained/' + model_name +
                                           'Critic.h5')
        score = 0
        solve_count = 0
        tr = tqdm(range(100))
        avg_time = 0
        for ep in tr:
            state = env.reset()

            for t in range(50):
                if render:
                    env.render()
                if random:
                    a = env.action_space.sample()
                else:
                    a = self.policy_action(self.format_state(state))[0]

                state, r, done, info = env.step(a)
                d = goal_distance(state['achieved_goal'],
                                  state['desired_goal'])
                done = d <= 0.05
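                # Success criterion: within 0.05 of the desired goal, which
                # overrides the environment's own done flag; the reward of the
                # solving step is not added to score (the loop breaks first).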
                if done:
                    solve_count += 1
                    break
                score += r
            tr.set_description("Solve percentage: {:.3f}".format(solve_count /
                                                                 (ep + 1)))
            avg_time += t
        print("average time to solve:", avg_time / 100.0)
        return score / 100.0