Example no. 1
# Current observation components: a 50x50 observation image, the last distance and the orientation
obs_img = np.zeros((50, 50))
obs_dis = last_distance
obs_ori = orientation

# Next observation, with the same structure
new_obs_img = np.zeros((50, 50))
new_obs_dis = last_distance
new_obs_ori = orientation

##################################################
''' We create the policy network (the TD3 Actor-Critic model) '''
policy = TD3(action_dim, max_action)

##################################################
''' We create the Experience Replay memory'''
replay_buffer = ReplayBuffer(sample_size=sample_size)

##################################################
# Loading the map mask, both as a Kivy CoreImage and as an OpenCV array
im = CoreImage("./images/MASK1.png")

imgCV2 = cv2.imread('./images/MASK1.png')
rows, cols, dims = imgCV2.shape

# Initializing the map
first_update = True


def init():
    global sand
    global goal_x
    global goal_y
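
The snippets above and below instantiate a ReplayBuffer without showing its definition. The following is a minimal sketch of a typical TD3-style experience replay memory, assuming an add()/sample() interface and a sample_size constructor argument as used in Example no. 1; the project's actual class may differ.

import numpy as np


class ReplayBuffer(object):
    """Stores (obs, new_obs, action, reward, done) transitions for off-policy training."""

    def __init__(self, max_size=1e6, sample_size=100):
        self.storage = []
        self.max_size = max_size
        self.sample_size = sample_size
        self.ptr = 0

    def add(self, transition):
        # Overwrite the oldest transition once the buffer is full
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        # Draw a random batch of transitions and split it into separate arrays
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, dones = [], [], [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            states.append(np.array(state, copy=False))
            next_states.append(np.array(next_state, copy=False))
            actions.append(np.array(action, copy=False))
            rewards.append(np.array(reward, copy=False))
            dones.append(np.array(done, copy=False))
        return (np.array(states), np.array(next_states), np.array(actions),
                np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1))
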
Example no. 2
episode_reward = 0
max_episode_timesteps = 500

torch.manual_seed(seed)
np.random.seed(seed)
state_dim = 5
action_dim = 1
max_action = 5
min_action = -5

# The old DQN "brain" (the neural network representing the Q-function) is kept commented out for reference; the TD3 policy below replaces it
#brain = Dqn(5,3,0.9)
action2rotation = [0, 5, -5]
#spacenetwork = ObsSpaceNetwork()
policy = TD3(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
last_reward = 0
scores = []
im = CoreImage("./images/MASK1.png")
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
episode_timesteps = 0
done = True
t0 = time.time()

# textureMask = CoreImage(source="./kivytest/simplemask1.png")

# Initializing the map
first_update = True
i = 0
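
The snippets call policy = TD3(state_dim, action_dim, max_action), policy.select_action(obs), policy.train(...) and policy.save(...) without showing the class itself. Below is a minimal sketch of the Actor network and select_action() under the standard TD3 formulation for a flat state vector (as in this example, where state_dim = 5); the full train() step with twin critics, target smoothing and delayed policy updates is omitted, and the image-shaped state_dim of Example no. 3 would need a convolutional actor instead.

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-max_action, max_action]."""

    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        # tanh squashes the output to [-1, 1]; scaling gives [-max_action, max_action]
        return self.max_action * torch.tanh(self.layer_3(x))


class TD3(object):
    """Skeleton exposing only the interface used by the snippets; the critics and train() are omitted."""

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)

    def select_action(self, state):
        # state is assumed to be a flat NumPy array
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()
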
Example no. 3
    def update(self, dt):

        global longueur
        global largeur

        longueur = self.width
        largeur = self.height
        if first_update:
            init()

        def evaluate_policy(policy, eval_episodes=10):
            avg_reward = 0.
            for _ in range(eval_episodes):
                obs = reset()
                done = False
                while not done:
                    action = policy.select_action(obs)
                    obs, reward, done, _ = move(action)
                    avg_reward += reward
            avg_reward /= eval_episodes
            print("---------------------------------------")
            print("Average Reward over the Evaluation Step: %f" % (avg_reward))
            print("---------------------------------------")
            return avg_reward

        file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
        print("---------------------------------------")
        print("Settings: %s" % (file_name))
        print("--------------------------------------  -")

        if not os.path.exists("./results"):
            os.makedirs("./results")
        if save_models and not os.path.exists("./pytorch_models"):
            os.makedirs("./pytorch_models")

        torch.manual_seed(seed)
        np.random.seed(seed)

        state_dim = [32, 32, 1]
        action_dim = 1
        max_action = 5

        policy = TD3(state_dim, action_dim, max_action)

        replay_buffer = ReplayBuffer()

        evaluations = [evaluate_policy(policy)]

        def mkdir(base, name):
            path = os.path.join(base, name)
            if not os.path.exists(path):
                os.makedirs(path)
            return path

        work_dir = mkdir('exp', 'brs')
        monitor_dir = mkdir(work_dir, 'monitor')
        max_episode_steps = 400

        total_timesteps = 0
        timesteps_since_eval = 0
        episode_num = 0
        done = True
        t0 = time.time()

        # We start the main loop over 40,000 timesteps
        while total_timesteps < max_timesteps:

            # If the episode is done
            if done:

                # If we are not at the very beginning, we start the training process of the model
                if total_timesteps != 0 and total_timesteps > batch_size:
                    print("Total Timesteps: {} Episode Num: {} Reward: {}".
                          format(total_timesteps, episode_num, episode_reward))
                    policy.train(replay_buffer, episode_timesteps, batch_size,
                                 discount, tau, policy_noise, noise_clip,
                                 policy_freq)

                # We evaluate the episode and we save the policy
                if timesteps_since_eval >= eval_freq:
                    timesteps_since_eval %= eval_freq
                    evaluations.append(evaluate_policy(policy))
                    policy.save(file_name, directory="./pytorch_models")
                    np.save("./results/%s" % (file_name), evaluations)

                # When the training step is done, we reset the state of the environment
                obs = reset()

                # Set the Done to False
                done = False

                # Set rewards and episode timesteps to zero
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            # Before start_timesteps timesteps, we play purely random actions
            if total_timesteps < start_timesteps:
                action = np.random.normal(0, 1,
                                          size=1).clip(-1,
                                                       1).astype(np.float32)
            else:  # After start_timesteps timesteps, we switch to the model
                action = policy.select_action(obs)
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if expl_noise != 0:
                    action = (action +
                              np.random.normal(0, expl_noise, size=1)).clip(
                                  -1, 1)

            # The agent performs the action in the environment, then reaches the next state and receives the reward
            new_obs, reward, done, _ = move(action)

            # We check if the episode is done
            if episode_timesteps + 1 == max_episode_steps:
                done = True
            # done_bool is the float termination flag stored in the replay buffer
            done_bool = float(done)
            # We increase the total reward
            episode_reward += reward

            # We store the new transition into the Experience Replay memory (ReplayBuffer)
            replay_buffer.add((obs, new_obs, action, reward, done_bool))

            # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
            obs = new_obs
            episode_timesteps += 1
            total_timesteps += 1
            timesteps_since_eval += 1

        t1 = time.time()
        print("Total time  taken: {}".format(t1 - t0))
        evaluations.append(evaluate_policy(policy))
        if save_models:
            policy.save("%s" % (file_name), directory="./pytorch_models")
        np.save("./results/%s" % (file_name), evaluations)
        CarApp().stop()
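
The loop above saves the policy with policy.save(file_name, directory="./pytorch_models") but never shows how it is reused. Assuming the TD3 class also exposes a load() that mirrors save() (an assumption; the snippets only show save()), the trained actor could be restored and run without exploration noise as in this sketch, reusing the reset()/move() helpers from Example no. 3.

# Hypothetical inference-only reuse of the saved policy;
# load() is assumed to mirror save() and may differ in the actual TD3 class
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))

policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, directory="./pytorch_models")

obs = reset()
done = False
while not done:
    # Greedy action from the learned actor: no exploration noise at inference time
    action = policy.select_action(obs)
    obs, reward, done, _ = move(action)
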