Example #1
def main(unused_argv):
    del unused_argv

    configs = extract_configs(*FLAGS.eval_config)
    # instantiate the Banana env
    env = BananaWrapper(file_name="./Banana")
    state_size = env.observation_size
    action_size = env.action_size
    # instantiate agent object
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  configs=configs)

    # load trained model
    agent.qnetwork_local.load_state_dict(
        torch.load("results/checkpoints/DoubleDQN.pth"))

    horizon = 1000
    episodes = 5
    for _ in range(episodes):
        state = env.reset()
        for _ in range(horizon):
            # Perform action given the trained policy
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            time.sleep(0.05)
            if done:
                break
            state = next_state

    # close env
    env.close()
Example #2
def main():

    ################################################
    # components required from main_02.py
    ################################################

    # spin up environment
    env = gym.make('LunarLander-v2')
    env.seed(0)

    # spin up agent (with underlying nn model)
    agent = Agent(state_size=8, action_size=4, seed=0)

    ################################################
    # Import trained agent and render performance
    ################################################

    # load the weights from file
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(10):
        state = env.reset()
        img = plt.imshow(env.render(mode='rgb_array'))
        for j in range(400):
            action = agent.act(state)
            img.set_data(env.render(mode='rgb_array'))
            plt.axis('off')
            state, reward, done, _ = env.step(action)
            if done:
                break

    env.close()
Example #3
def DQN_gif(file_name):
    env = gym.make('LunarLander-v2')
    env.seed(0)

    agent = Agent(state_size=8, action_size=4, seed=0)
    agent.qnetwork_local.load_state_dict(
        torch.load('checkpoint.pth',
                   map_location=lambda storage, loc: storage))

    images = []
    state = env.reset()
    img = env.render(mode='rgb_array')
    for j in range(200):
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        frame = env.render(mode='rgb_array')

        pil_img = Image.fromarray(frame)
        draw = ImageDraw.Draw(pil_img)
        text = 'Step = {}\nReward = {}'.format(j + 1, reward)
        draw.text((20, 20), text, (255, 255, 255))

        images.append(np.asarray(pil_img))

        if done:
            break
    imageio.mimsave(file_name, images)
Example #4
def main():
    env = UnityEnvironment(file_name="./../Banana.app")

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Instantiate agent:
    env_info = env.reset(train_mode=False)[brain_name]
    #agent = Agent(state_size=8, action_size=4, seed=0)
    action_size = brain.vector_action_space_size
    state = env_info.vector_observations[0]
    state_size = len(state)
    agent = Agent(state_size=state_size, action_size=action_size, seed=0)

    # load the weights from file
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(3):
        #state = env.reset()
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        for j in range(200):
            action = agent.act(state)
            #state, reward, done, _ = env.step(action)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            if done:
                break
            state = next_state

    env.close()
Example #5
    def test(self, run_id=5):
        agent = Agent(state_size=37, action_size=4, seed=0)

        run_dir = "results/{}".format(run_id)

        # load the weights from file
        agent.qnetwork_local.load_state_dict(
            torch.load("{}/checkpoint.pth".format(run_dir)))

        for i in range(5):

            env_info = self.env.reset(
                train_mode=False)[self.brain_name]  # reset the environment
            state = env_info.vector_observations[0]

            for j in range(50):
                action = agent.act(state)

                env_info = self.env.step(action)[
                    self.brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished

                state = next_state  # roll over the state to the next time step

                if done:
                    break
Example #6
def main(
    file_name="/Users/joshuaschoenfield/Downloads/Banana.app",
    weights_file="checkpoint_banana_2_LONG_SAFE.pth",
):
    with get_environment(file_name=file_name) as env:
        from dqn_agent import Agent

        agent = Agent(state_size=37, action_size=4, seed=0)

        agent.qnetwork_local.load_state_dict(torch.load(weights_file))
        scores = []
        num_iterations = 100
        for i in range(num_iterations):
            state = reset_and_get_first_state(env, train_mode=True)
            score = 0
            for j in range(2000):
                action = agent.act(state, eps=0)
                # env.render()
                state, reward, done = get_next_state_reward_done(env, action)
                score += reward
                if done:
                    break
            scores.append(score)
            # print(f"Score: {score}")
        # print(f"Average Score: {np.mean(scores)}")
        ax = plot_score_cumulative_distribution(scores)
        ax.figure.savefig("Media/validation_scores_cumulative.png")
        # plt.show()
        np.savetxt("validation_scores.txt", scores)
        return scores
Example #7
def process(args):
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # get the current state
    score = 0  # initialize the score
    action_size = brain.vector_action_space_size
    state_size = len(state)

    agent = Agent(state_size, action_size, 1, args.model_path)

    while True:
        action = agent.act(state, 0.0)  # select an action
        env_info = env.step(action)[
            brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        score += reward  # update the score
        state = next_state  # roll over the state to next time step
        if done:  # exit loop if episode finished
            break

    print("Score: {}".format(score))
Example #8
def test(n_epi):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]  # get the default brain
    brain = env.brains[brain_name]
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i in range(n_epi):
        score = 0  # initialize the score
        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        while True:
            action = agent.act(state)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            score += reward  # update the score
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state  # roll over the state to next time step
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i, np.mean(scores_window)))

    env.close()
Example #9
def main():
    agent = Agent(state_size=3, action_size=8, seed=0)

    start_pos = (200, 600)
    end_pos = (800, 375)
    #start_pos = (200,500)
    #end_pos = (800,500)
    env = environment(MAP, start_pos, end_pos)
    """
	x_end, y_end = end_pos
	plt.figure(figsize=(10,6), dpi=200)
	plt.plot(start_pos[0], start_pos[1], 'rx')
	plt.plot(x_end, y_end, 'bx')
	plt.contourf(np.array(MAP), linestyles='dashed')
	#plt.imshow(np.array(MAP))
	plt.gca().set_aspect('equal', adjustable='box')
	plt.colorbar()
	plt.show()
	sys.exit(())
	"""

    # load the weights from file
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    for i in range(1):
        path_x = [start_pos[0]]
        path_y = [start_pos[1]]

        state, _, _ = env.reset(start_pos, end_pos)
        for j in range(6000):
            action = agent.act(state)

            #print (j, action)
            print(j)
            #if j%100 == 0:
            #	env.render()

            state, reward, done = env.step(action)

            path_x.append(state[0])
            path_y.append(state[1])

            if done:
                break

        print(done)
        x_end, y_end = end_pos
        plt.figure(figsize=(10, 6), dpi=200)
        plt.plot(path_x, path_y, 'ro', markevery=20)
        plt.plot(x_end, y_end, 'bx')
        plt.contourf(np.array(MAP), linestyles='dashed')
        #plt.imshow(np.array(MAP))
        plt.gca().set_aspect('equal', adjustable='box')
        plt.colorbar()
        plt.show()

    env.close()
Example #10
def dqn(LR,
        GAMMA,
        TAU,
        BUFF,
        UPD,
        n_episodes=1000,
        max_t=100,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """

    agent = Agent(state_size, action_size, LR, GAMMA, TAU, BUFF, UPD, seed=0)
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            #        if np.mean(scores_window)>=13.0:
            #            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            #break
#    return scores
    return np.mean(scores_window)
Example #11
def dqn(args, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.
    
    Params
    ======
        args : command line arguments
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    state_size = 37
    action_size = 4
    agent = Agent(state_size, action_size, 1)
    for i_episode in range(1, args.num_episodes + 1):
        #resetting the environment for a new episode
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        cnt = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            cnt += 1
            if done:
                break
        scores_window.append(
            score)  # save most recent score in the 100 episode window
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score in the last 100 episodes: {:.2f}'.
              format(i_episode, np.mean(scores_window)),
              end="")
        if i_episode % args.save_every == 0:
            print(
                '\nSaving Checkpoint for {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(
                agent.qnetwork_local.state_dict(),
                os.path.join(args.save_checkpoint_path,
                             'checkpoint_' + str(i_episode) + '.pth'))
    return scores
Example #12
def test(dev, weights_file, n_episodes=100, max_t=1000):
    """Test the environment with the parameters stored in weights_file

    Params
    ======
        dev (string): cpu or gpu
        weights_file (string): name of the file to load the weights
        n_episodes (int): number of test episodes that will be performed
        max_t (int): maximum number of timesteps per episode
    """
    env = UnityEnvironment(file_name='./Banana_Linux/Banana.x86_64')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    state_size = len(env_info.vector_observations[0])
    action_size = brain.vector_action_space_size
    agent = Agent(state_size, action_size, seed=0, device=dev)

    # load the weights from file
    print('Loading weights')
    try:
        checkpoint = torch.load(weights_file)
    except FileNotFoundError:
        print('Error: File \'{}\' not found'.format(weights_file))
        sys.exit(1)

    agent.qnetwork_local.load_state_dict(checkpoint)
    scores = []
    print('Running {} episodes'.format(n_episodes))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        score = 0
        state = env_info.vector_observations[0]
        for j in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            if done:
                break
        scores.append(score)
        if (i_episode % 100 != 0):
            print('\rEpisode {}\tScore: {:.0f}\tAverage Score: {:.2f}'.format(
                i_episode, score, np.mean(scores)),
                  end="")
        else:
            print('\rEpisode {}\tScore: {:.0f}\tAverage Score: {:.2f}'.format(
                i_episode, score, np.mean(scores)))

    env.close()
Example #13
def dqn(agent: Agent, params: Params):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """

    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = params.eps_start  # initialize epsilon
    for i_episode in range(1, params.n_episodes + 1):
        agent.init_episode()
        score = 0  # initialize the score
        for t in range(params.max_t):
            agent.act(eps)  # to be defined in the agent
            agent.step()  # to be defined in the agent
            score += agent.get_reward()
            if agent.get_done():
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(params.eps_end, params.eps_decay * eps)  # decrease epsilon
        # print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 200.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores
Example #14
def play_banana(isDoubleDQN=0):
    isDoubleDQN = int(isDoubleDQN)
    # the path to the environment can differ between operating systems
    env = UnityEnvironment(file_name=r"Banana_Windows_x86_64\Banana.exe")

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of actions
    action_size = brain.vector_action_space_size
    # examine the state space 
    state = env_info.vector_observations[0]
    state_size = len(state)

    # instantiate agent
    agent = Agent(state_size=state_size, action_size=action_size, seed=0, isDoubleDQN=isDoubleDQN)

    # load the weights from file
    if (isDoubleDQN==1):
        print("Using Double DQN")
        agent.qnetwork_local.load_state_dict(torch.load('checkpoint_double_agent.pth'))
    else:
        print("Not Using Double DQN")
        agent.qnetwork_local.load_state_dict(torch.load('checkpoint_simple_agent.pth'))  

    # start the agent
    env_info = env.reset(train_mode=False)[brain_name] # reset the environment
    state = env_info.vector_observations[0]            # get the current state
    score = 0                                          # initialize the score
    while True:
        action = agent.act(state, eps=0)      # select an action
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # see if episode has finished
        score += reward                                # update the score
        state = next_state                             # roll over the state to next time step
        if done:                                       # exit loop if episode finished
            break
        
    print("Score: {}".format(score))

    env.close()
Example #15
def trainFunction(n_episodes=2000,
                  max_t=1000,
                  eps_start=1.0,
                  eps_end=0.01,
                  eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0, priority=True)
    epsilons = []
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(
            train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action.astype(np.int32))[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        epsilons.append(eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
        # if np.mean(scores_window)>=13.0:

    print('\nEnvironment finished in {:d} episodes!\tAverage Score: {:.2f}'.
          format(i_episode, np.mean(scores_window)))
    torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
    return scores, epsilons
Example #16
def testFunction():
    agent = Agent(state_size=37, action_size=4, seed=0)
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # get the current state
    score = 0  # initialize the score
    time_steps = 100000
    for t in range(time_steps):
        action = agent.act(state)  # select an action
        env_info = env.step(action.astype(
            np.int32))[brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        score += reward  # update the score
        state = next_state  # roll over the state to next time step
        if done:  # exit loop if episode finished
            break

    print("Score: {}".format(score))
Example #17
def run(agent_source, location, n_episodes):

    source = AgentSource[agent_source.upper()]
    agent = Agent(state_size=8,
                  action_size=4,
                  seed=0,
                  learning_strategy=LearningStrategy.DQN)

    path_to_agent_checkpoint = retrieve_agent_checkpoint(source, location)
    agent.qnetwork_local.load_state_dict(
        torch.load(path_to_agent_checkpoint,
                   map_location=lambda storage, loc: storage))

    #display = Display(visible=0, size=(1400, 900))
    #display.start()

    env = gym.make('LunarLander-v2')
    env.seed(0)
    print('State shape: ', env.observation_space.shape)
    print('Number of actions: ', env.action_space.n)

    for i in range(n_episodes):
        state = env.reset()
        #img = plt.imshow(
        env.render(mode='rgb_array')
        for j in range(500):
            action = agent.act(state)
            #img.set_data(
            env.render(mode='rgb_array')
            plt.axis('off')
            time.sleep(0.1)
            #display.display(plt.gcf())
            #display.clear_output(wait=True)
            state, reward, done, _ = env.step(action)
            if done:
                break

    env.close()
Example #18
def testAgent():
    print("Testing the Agent")
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  seed=0,
                  pretrainedWeightsFile='checkpoint.pth',
                  train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # get the current state
    score = 0  # initialize the score
    while True:
        action = agent.act(state)  # select an action
        env_info = env.step(
            action.item())[brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        score += reward  # update the score
        state = next_state  # roll over the state to next time step
        if done:  # exit loop if episode finished
            break
    print("Score: {}".format(score))
    return score
Example #19
def train(n_episodes=2000, eps_start=1.0, eps_end=0.025, eps_decay=0.995):
    agent = Agent(state_size=37, action_size=4, seed=0)
    env = UnityEnvironment(file_name="Banana.app")
    brain_name = env.brain_names[0]                    # get the default brain
    brain = env.brains[brain_name]
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name] # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        score = 0                                          # initialize the score
        while True:
            action = agent.act(state, eps)                 # select an action
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            score += reward                                # update the score
            state = next_state                             # roll over the state to next time step
            if done:                                       # exit loop if episode finished
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint_Nav_V01_13.pth')
            env.close()    
            break
    return scores    
Example #20
def demo4_LearningPathPlanning(setting):

    n_sample = 100

    # Environment
    env = FireEnvironment(64, 64)
    # Vehicle to generate observation mask
    vehicle = Vehicle(n_time_windows=512,
                      grid_size=(64, 64),
                      planner_type='Default')
    # Trainer and Estimator
    dyn_autoencoder = DynamicAutoEncoder(SETTING,
                                         grid_size=(env.map_width,
                                                    env.map_height),
                                         n_state=3,
                                         n_obs=3,
                                         encoding_dim=16,
                                         gru_hidden_dim=16)
    ### DQN agent
    dqn_agent = DQN_Agent(state_size=16,
                          action_size=4,
                          replay_memory_size=1000,
                          batch_size=64,
                          gamma=0.99,
                          learning_rate=0.01,
                          target_tau=0.01,
                          update_rate=1,
                          seed=0)
    # Train Data Buffer
    memory = SingleTrajectoryBuffer(N_MEMORY_SIZE)
    # Train Iteration Logger

    writer = SummaryWriter()
    # Video Writer
    video_writer1 = ImageStreamWriter('LearningPlanner.avi',
                                      FPS,
                                      image_size=(1200, 820))

    # Add concat. text
    setting_text = ''
    for k, v in setting.items():
        setting_text += k
        setting_text += ':'
        setting_text += str(v)
        setting_text += '\t'
    writer.add_text('setting', setting_text)

    ########################################
    ### Interacting with the Environment ###
    ########################################
    mask_obs, obs, state = env.reset()
    state_est_grid = dyn_autoencoder.u_k

    ### Loss Monitors ###
    list_loss = []
    list_cross_entropy_loss = []
    list_entropy_loss = []
    list_rewards = []
    list_new_fire_count = []
    list_action = []

    ### Filling the Data Buffer ###
    for i in tqdm.tqdm(range(N_TRAIN_WAIT)):
        map_visit_mask, img_resized = vehicle.full_mask()
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

    for i in tqdm.tqdm(range(N_TOTAL_TIME_STEPS)):

        # determine epsilon-greedy action from the current state
        h_k = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()
        epsilon = 0.1
        action = dqn_agent.act(h_k, epsilon)
        list_action.append(action)

        ### Collect Data from the Env. ###
        map_visit_mask, img_resized = vehicle.plan_a_trajectory(
            state_est_grid, n_sample, action)
        mask_obs, obs, state, reward, info = env.step(map_visit_mask)
        memory.add(mask_obs.detach().long(),
                   state.detach().long(),
                   map_visit_mask.detach().long())

        ### Run the Estimator ###
        state_est_grid = dyn_autoencoder.step(mask_obs, map_visit_mask)
        h_kp1 = dyn_autoencoder.h_k.squeeze().data.cpu().numpy()

        #### Update the reinforcement learning agent ###
        dqn_agent.step(h_k, action, reward, h_kp1, done=False)

        list_rewards.append(reward)
        list_new_fire_count.append(info['new_fire_count'])

        ################################
        ### Rendering and Save Video ###
        ################################
        img_env = env.output_image()
        img_agent = dyn_autoencoder.output_image(state_est_grid)

        # State Est
        #blank = np.zeros((400, 200, 3))
        img_top = img_env  #np.concatenate((blank, img_env[:,:800], blank), axis=1)
        blank = np.zeros((20, 1200, 3))
        img_top = np.concatenate((img_top, blank), axis=0)
        img_top = (img_top * 255).astype('uint8')

        img_state_est_grid_uint8 = (img_agent * 255).astype('uint8')
        backtorgb = cv2.cvtColor(img_state_est_grid_uint8, cv2.COLOR_GRAY2RGB)
        img_bayes_uint8 = np.concatenate((img_top, backtorgb),
                                         axis=0)  #<-- to be saved
        render('Dynamic Auto Encoder', img_bayes_uint8, 1)

        # Save video #
        video_writer1.write_image_frame(img_bayes_uint8)

        ### Training ###
        loss_val, loss_val_cross, loss_val_ent, O_np_val = dyn_autoencoder.update(
            memory, N_TRAIN_BATCH, N_TRAIN_WINDOW)
        list_loss.append(loss_val)
        list_cross_entropy_loss.append(loss_val_cross)
        list_entropy_loss.append(loss_val_ent)

        if i % N_LOGGING_PERIOD == 0:
            avg_loss = np.mean(np.array(list_loss))
            list_loss = []
            writer.add_scalar('dynautoenc/loss', avg_loss, i)

            avg_loss_cross = np.mean(np.array(list_cross_entropy_loss))
            list_cross_entropy_loss = []
            writer.add_scalar('dynautoenc/crossentropy', avg_loss_cross, i)

            avg_loss_entropy = np.mean(np.array(list_entropy_loss))
            list_entropy_loss = []
            writer.add_scalar('dynautoenc/shannonentropy', avg_loss_entropy, i)

            avg_reward = np.mean(np.array(list_rewards))
            list_rewards = []
            writer.add_scalar('perform/rewards', avg_reward, i)

            avg_new_fire_count = np.mean(np.array(list_new_fire_count))
            list_new_fire_count = []
            writer.add_scalar('perform/new_fire_counts', avg_new_fire_count, i)

            writer.add_scalar('perform/pc_coverd_new_fire',
                              avg_reward / avg_new_fire_count, i)

            action_0_count = list_action.count(0)
            action_1_count = list_action.count(1)
            action_2_count = list_action.count(2)
            action_3_count = list_action.count(3)

            writer.add_scalar('action_count/0',
                              action_0_count / len(list_action), i)
            writer.add_scalar('action_count/1',
                              action_1_count / len(list_action), i)
            writer.add_scalar('action_count/2',
                              action_2_count / len(list_action), i)
            writer.add_scalar('action_count/3',
                              action_3_count / len(list_action), i)
            list_action = []

            writer.add_scalar('obs_state0/o00', O_np_val[0][0], i)
            writer.add_scalar('obs_state1/o01', O_np_val[0][1], i)
            writer.add_scalar('obs_state2/o02', O_np_val[0][2], i)
            writer.add_scalar('obs_state0/o10', O_np_val[1][0], i)
            writer.add_scalar('obs_state1/o11', O_np_val[1][1], i)
            writer.add_scalar('obs_state2/o12', O_np_val[1][2], i)
            writer.add_scalar('obs_state0/o20', O_np_val[2][0], i)
            writer.add_scalar('obs_state1/o21', O_np_val[2][1], i)
            writer.add_scalar('obs_state2/o22', O_np_val[2][2], i)

            print(
                'losses at iteration: %d, losses: total %.3f, cross %.3f, shannon %.3f'
                % (i, avg_loss, avg_loss_cross, avg_loss_entropy))
            print('memory size at iteration: %d, size: %d' %
                  (i, len(memory.obs_memory)))

        if (i + 1) % N_SAVING_PERIOD == 0:
            f_name = setting['name']
            dyn_autoencoder.save_the_model(i, f_name)
            dqn_agent.save_the_model(i, f_name)

    video_writer1.close()
Example #21
class DQN():
    # env assumption: env.reset(), env.render(), env.step(), env.close()
    def __init__(self, name, state_size, action_size, env, load_net=False):
        self.agent = Agent(name,
                           state_size=state_size,
                           action_size=action_size,
                           seed=0)
        self.env = env
        self.saved_network = name + '_dqn_checkpoint.pth'
        self.load_net = load_net
        if load_net:
            print('Loading pretrained network...')
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))
            self.agent.qnetwork_target.load_state_dict(
                torch.load(self.saved_network))
            print('Loaded.')

    def train(self,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995,
              score_window_size=100,
              target_score=13.0,
              save=True,
              verbose=True):
        """Deep Q-Learning.

            Params
            ======
                n_episodes (int): maximum number of training episodes
                max_t (int): maximum number of timesteps per episode
                eps_start (float): starting value of epsilon, for epsilon-greedy action selection
                eps_end (float): minimum value of epsilon
                eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
            """
        scores = []  # list containing scores from each episode
        scores_window = deque(
            maxlen=score_window_size)  # last score_window_size scores
        eps = eps_start  # initialize epsilon
        save12 = False
        for i_episode in range(1, n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            avg_score = np.mean(scores_window)
            if avg_score > 13.0 and not save12 and not self.load_net:
                torch.save(self.agent.qnetwork_local.state_dict(),
                           self.saved_network)
                np.save('scores13_0824.npy', np.array(scores))
                save12 = True
            if avg_score >= target_score and i_episode > 100:
                if verbose:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(i_episode, np.mean(scores_window)))
                self.solved = True
                if save:
                    torch.save(self.agent.qnetwork_local.state_dict(),
                               self.saved_network)
                break

            if verbose:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)),
                      end="")
                if i_episode % 100 == 0:
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores_window)))

        if save:
            torch.save(self.agent.qnetwork_local.state_dict(),
                       self.saved_network)

        return scores

    def play(self, trials=3, steps=200, load=False):
        if load:
            self.agent.qnetwork_local.load_state_dict(
                torch.load(self.saved_network))

        for i in range(trials):
            total_reward = 0
            print('Start Trial...')
            state = self.env.reset()
            for j in range(steps):
                action = self.agent.act(state)
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if reward != 0:
                    print("Current Reward:", reward, "Total Reward:",
                          total_reward)
                if done:
                    print('Done.')
                    break
        self.env.close()
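
A minimal usage sketch for the wrapper above, assuming the classic-gym LunarLander-v2 environment used elsewhere in these examples (a gym-style reset/render/step/close API with an 8-dimensional state and 4 actions); the run name, episode count, and target score below are illustrative values, not taken from the original repository.

import gym

# assumed gym-style environment matching the class's env assumption comment
env = gym.make('LunarLander-v2')
env.seed(0)

dqn = DQN(name='lunarlander', state_size=8, action_size=4, env=env)
scores = dqn.train(n_episodes=1000, target_score=200.0)  # hypothetical target for LunarLander
dqn.play(trials=3, steps=500, load=True)                 # replay with the saved checkpoint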
Example #22
print('Number of actions: ', env.action_space.n)


# Please refer to the instructions in `Deep_Q_Network.ipynb` if you would like to write your own DQN agent.  Otherwise, run the code cell below to load the solution files.

# In[4]:


from dqn_agent import Agent

agent = Agent(state_size=8, action_size=4, seed=0)

# watch an untrained agent
state = env.reset()
for j in range(200):
    action = agent.act(state)
    env.render()
    state, reward, done, _ = env.step(action)
    if done:
        break 
        
env.close()


# ### 3. Train the Agent with DQN
# 
# Run the code cell below to train the agent from scratch.  You are welcome to amend the supplied values of the parameters in the function, to try to see if you can get better performance!
# 
# Alternatively, you can skip to the next step below (**4. Watch a Smart Agent!**), to load the saved model weights from a pre-trained agent.

# In[5]:
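
# The original training cell is not reproduced here; the sketch below follows the standard dqn() training
# loop that recurs throughout these examples (epsilon-greedy exploration, a 100-episode score window,
# checkpointing once the environment is considered solved). The hyperparameter values and the 200.0
# solve threshold for LunarLander-v2 are common defaults, not values confirmed from this notebook.

from collections import deque

import numpy as np
import torch


def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    return scores

scores = dqn()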
Example #23
brain_name = env.brain_names[0]
brain = env.brains[brain_name]


agent = Agent(state_size=brain.vector_observation_space_size, action_size=brain.vector_action_space_size, seed=0)

# Load trained model weights
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]            # get the current state
score = 0                                          # initialize the score
eps = 0.0

while True:
    #replace: action = np.random.randint(action_size)        # select an action
    action = agent.act(state, eps)
    env_info = env.step(action)[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations[0]   # get the next state
    reward = env_info.rewards[0]                   # get the reward
    done = env_info.local_done[0]                  # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step
    if done:                                       # exit loop if episode finished
        break
    
print(f"Score: {score}")

# when finish testing, close the environment
env.close()
Example #24
def run(experiment_name, num_iterations, learning_rate, buffer_size,
        batch_size, gamma, epsilon, epsilod_decay, epsilon_min, stack_size,
        device, is_ddqn, evaluation_rate, log_directory):
    scores = []

    episodic_accum = 0
    epsidoic_rewards = []
    iteration_rewards = []
    episode = 1

    agent = Agent(env=env, state_space=state_space, action_space=action_space, learning_rate=learning_rate,\
                     buffer_size=buffer_size, batch_size=batch_size, gamma=gamma,\
                     device=device, in_channels=stack_size, is_ddqn = is_ddqn)

    #initializing log directory for tensorboard
    if not os.path.exists(log_directory):
        os.makedirs(log_directory)
    tb_writer = SummaryWriter('{}/{}'.format(log_directory, experiment_name))

    frame_count = 0
    epoch_plot_count = 0
    stop = False
    prev_iteration = None
    while agent.num_train_updates < num_iterations + 1 and not stop:
        state = env.reset()
        done = False

        # current state & 3-previous states
        state_frames = deque(maxlen=stack_size)

        episode_reward = []

        while not done:
            frame_count += 1

            _state = preprocess_state(state)
            state = torch.from_numpy(_state).float()

            # if it's the first frame, copy the same state multiple time in the stack
            if len(state_frames) < stack_size:
                for i in range(stack_size):
                    state_frames.append(state)
            else:
                state_frames.append(state)

            state_stack = torch.stack(list(state_frames)).unsqueeze(dim=0)
            action = agent.act(state_stack, epsilon)
            prev_action = action

            next_state, reward, done, info = env.step(action)
            _next_state = preprocess_state(next_state)
            _next_state = torch.from_numpy(_next_state).float()
            agent.step(state_frames.__copy__(), action, reward, _next_state,
                       done)
            state = next_state

            episodic_accum += reward
            iteration_rewards.append(reward)

            if agent.num_train_updates > 0:
                # evaluate every 1M steps and decay epsilon (based on paper)
                if agent.num_train_updates % evaluation_rate == 0 and prev_iteration != agent.num_train_updates:
                    epsilon = max(epsilon_min, epsilon * epsilod_decay)
                    prev_iteration = agent.num_train_updates

            if agent.num_train_updates > num_iterations:
                stop = True

        episode += 1
        epsidoic_rewards.append(episodic_accum)
        episodic_accum = 0.

        if episode % 100 == 0 and len(epsidoic_rewards) > 20:
            tb_writer.add_scalar('Episode Accum score',
                                 np.mean(epsidoic_rewards[-20:]), episode)
            print('episode_num:{}\tepisode_score:{}\tepsilon:{}\tmemory_size:{}'.format(\
                  episode, np.mean(epsidoic_rewards[-20:]), epsilon,len(agent.memory)))
            torch.save(agent.QNetwork_local.state_dict(),
                       '{}_checkpoint.pth'.format(experiment_name))
    return epsidoic_rewards
Example #25
def dqn(n_episodes=4000,
        max_t=3000,
        eps_start=1.0,
        eps_end=0.01,
        eps_decay=0.995):
    agent = Agent(state_size=3, action_size=8, seed=0)

    start_pos = (200, 600)
    end_pos = (800, 375)
    env = environment(MAP, start_pos, end_pos)
    """Deep Q-Learning.
	
	Params
	======
		n_episodes (int): maximum number of training episodes
		max_t (int): maximum number of timesteps per episode
		eps_start (float): starting value of epsilon, for epsilon-greedy action selection
		eps_end (float): minimum value of epsilon
		eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
	"""
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon

    for i_episode in range(1, n_episodes + 1):
        state, _, _ = env.reset(start_pos, end_pos)
        score = 0

        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward

            if done:
                #print (state)
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)),
              end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))

        #if np.mean(scores_window)>=200.0:
        #print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))

        if i_episode % 200 == 0:
            torch.save(agent.qnetwork_local.state_dict(),
                       'checkpoint' + str(i_episode) + '.pth')

        #torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
        #break

    return scores
Example #26
    state = env_info.vector_observations[0]

    # set the initial episode score to zero.
    score = 0

    # Run the episode training loop;
    # At each loop step take an epsilon-greedy action as a function of the current state observations
    # Based on the resultant environmental state (next_state) and reward received update the Agent network
    # If environment episode is done, exit loop...
    # Otherwise repeat until done == true
    converted_action_size = brain.vector_action_space_size
    converted_agent_num = len(env_info.agents)

    while True:
        # determine epsilon-greedy action from the current state
        action = agent.act(state, epsilon)

        # if round(action) == 0:
        #     converted_action = np.array([[1,0,0,0]])
        # elif round(action) == 1:
        #     converted_action = np.array([[-1,0,0,0]])
        # elif round(action) == 2:
        #     converted_action = np.array([[0,0,-1,0]])
        # elif round(action) == 3:
        #     converted_action = np.array([[0,0,1,0]])
        if round(action) == 0:
            converted_action = np.array([[1, 0, 0, 0]])  # forward
        elif round(action) == 1:
            converted_action = np.array([[2, 0, 0, 0]])  # backward
        elif round(action) == 2:
            converted_action = np.array([[0, 0, 1, 0]])  # counterclock
Example #27
# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

agent = Agent(state_size=state_size, action_size=action_size)
agent.qnetwork_local.load_state_dict(
    torch.load(dirpath + "/checkpoint.pth"))
max_t = 1000
for i in range(10):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]
    score = 0
    for t in range(max_t):
        action = agent.act(state, 0.01)
        # send the action to the environment
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # see if episode has finished
        score += reward
        state = next_state
        if done:
            break
    print(score)

env.close()


# from unityagents import UnityEnvironment
Example #28
stockData = list(df['收盘'])  # '收盘' = closing price
l = len(stockData) - 1
window_size = 10
state = getState(stockData, 0, window_size + 1)
# total_profit = 0
agent.inventory = []
action_list = []
pos_list = []
pos_old = 0
total_share = 0
cost = 0
money_initial = 10000
money = money_initial
for t in range(l):
    action = agent.act(state, eps=0, is_eval=True)
    next_state = getState(stockData, t + 1, STATE_SIZE + 1)
    if action == 1:  # buy
        pos_new = min(pos_old + 0.2, 1)
        total_share += money * (pos_new - pos_old) / stockData[t]
        #agent.inventory.append(stockData[t])
        # print("buy" + str(stockData[t]))
    elif action == 2:
        pos_new = max(pos_old - 0.2, 0)
        total_share += money * (pos_new - pos_old) / stockData[t]
        #bought_price = agent.inventory.pop(0)
        #total_profit += stockData[t] - bought_price
    else:
        pos_new = pos_old
    money = money_calculate(money, total_share, stockData[t], pos_new)
Example #29
    def run(self,
            run_id=1,
            n_episodes=2000,
            max_t=1000,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995,
            lr=5e-4,
            use_double_dqn=False,
            use_soft_update=True):
        start = time.time()

        agent = Agent(state_size=37,
                      action_size=4,
                      seed=0,
                      lr=lr,
                      use_double_dqn=use_double_dqn,
                      use_soft_update=use_soft_update)

        # list containing scores from each episode
        scores = []

        # last 100 scores
        scores_window = deque(maxlen=100)

        # initialize epsilon
        eps = eps_start

        for i_episode in range(1, n_episodes + 1):

            # reset the environment
            env_info = self.env.reset(train_mode=True)[self.brain_name]

            # get the current state
            state = env_info.vector_observations[0]

            score = 0

            for t in range(max_t):

                action = agent.act(state, eps)
                #print("action: ", action)

                # send the action to the environment
                env_info = self.env.step(action)[self.brain_name]

                # get the next state
                next_state = env_info.vector_observations[0]

                # get the reward
                reward = env_info.rewards[0]

                # see if episode has finished
                done = env_info.local_done[0]

                # update the agent with the observed transition (store in replay buffer and learn)
                agent.step(state, action, reward, next_state, done)

                state = next_state
                score += reward

                if done:
                    break

            # save most recent score
            scores_window.append(score)

            # save most recent score
            scores.append(score)

            # decrease epsilon
            eps = max(eps_end, eps_decay * eps)

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")

            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))

            if np.mean(scores_window) >= 14.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))

                end = time.time()
                elapsed = end - start
                print("\nTime taken to solve: {:.2f} minutes".format(elapsed /
                                                                     60.0))

                run_dir = "results/{}".format(run_id)
                os.mkdir(run_dir)

                torch.save(agent.qnetwork_local.state_dict(),
                           "{}/checkpoint.pth".format(run_dir))
                break

        return scores
Example #30
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'weights.pth')
            break
    return scores

scores = dqn()


# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()


# load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('weights.pth'))

for i in range(3):
    state = env.reset()
    for j in range(200):
        action = agent.act(state)
        env.render()
        state, reward, done, _ = env.step(action)
        if done:
            break

env.close()
Example #31
def objective(trial):
    file = pd.ExcelFile(r'sasd_a2c.xlsx')
    state_index_oh = file.parse('state_index')
    MaxEpisodes = 2000
    Env = env()
    EPSILON = 1
    Total_Reward = []
    Avg_Rewards = []
    # output1_lst = []
    # output2_lst = []
    # input1_lst = []
    # input2_lst = []

    fc1_dims = trial.suggest_categorical('fc1_dims', [15, 20, 30])
    lr = trial.suggest_uniform("lr", 5e-6, 1e-4)
    gamma = trial.suggest_categorical("gamma", [0.97, 0.98, 0.99])
    lr_ns = trial.suggest_uniform("lr_ns", 1e-4, 1e-2)
    lr_r = trial.suggest_uniform("lr_r", 1e-4, 5e-3)
    lr_d = trial.suggest_uniform("lr_d", 1e-4, 1e-3)

    agent1 = Agent(state_size=9,
                   action_size=10,
                   fc1_dims=fc1_dims,
                   lr=lr,
                   batch_size=64,
                   buffer_size=100000,
                   gamma=gamma,
                   tau=0.002,
                   lr_ns=lr_ns,
                   lr_r=lr_r,
                   lr_d=lr_d)  #fc1=32 lr=0.0009 gamma=0,98
    agent2 = Agent(state_size=9,
                   action_size=10,
                   fc1_dims=fc1_dims,
                   lr=lr,
                   batch_size=64,
                   buffer_size=100000,
                   gamma=gamma,
                   tau=0.002,
                   lr_ns=lr_ns,
                   lr_r=lr_r,
                   lr_d=lr_d)
    writer = SummaryWriter()
    writer.add_graph(
        agent1.q_network,
        torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())
    writer.add_graph(
        agent2.q_network,
        torch.from_numpy(state_index_oh.iloc[:, 2:].values).float())
    writer.close()
    agent1.memory.buffer_reset()
    agent2.memory.buffer_reset()
    for ep in range(MaxEpisodes):
        state = Env.reset()  #torch.zeros(1)
        # agent.memory.buffer_reset()
        done = False
        stepscounter = 0
        ep_reward = 0
        state_OH = state_index_oh.iloc[state.int().numpy(),
                                       2:].values.reshape(-1)

        while not done:
            stepscounter += 1

            action1 = agent1.act(state_OH, EPSILON)
            action = action1 * 10
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[new_state.int().numpy(),
                                               2:].values.reshape(-1)
            agent1.memory.store_transition(
                state_OH, action1, 2 * ((reward.item() + 50) / 100) - 1,
                new_state, done)
            state_OH = new_state_OH

            if done == True:
                break
            action2 = agent2.act(state_OH, EPSILON)
            if action2 == action1:
                continue
            action = action2
            new_state, reward, done, obs = Env.next_state(action)
            ep_reward += reward
            new_state_OH = state_index_oh.iloc[new_state.int().numpy(),
                                               2:].values.reshape(-1)
            agent2.memory.store_transition(
                state_OH, action2, 2 * ((reward.item() + 50) / 100) - 1,
                new_state, done)
            state_OH = new_state_OH

            agent1.learn()
            agent2.learn()
            update_model1 = agent1.train_model(1)
            update_model2 = agent2.train_model(2)
            for _ in range(5):
                agent1.sim_learn(1)
                agent2.sim_learn(2)

        output1 = obs[0].item()
        output2 = obs[1].item()
        input1 = obs[2].item()
        input2 = obs[3].item()

        EPSILON = epsilon_decay(eps=EPSILON)
        Total_Reward.append(ep_reward)
        avg_reward = np.mean(Total_Reward[-100:])
        Avg_Rewards.append(avg_reward)

        if ep % 1 == 0:
            totalresult = 'episode: ' + str(
                ep + 1
            ) + '  Total_Reward %.2f' % ep_reward + '  Average_Reward %.2f' % avg_reward + '  Steps ' + str(
                stepscounter) + ' Model Training Data: ' + str(
                    update_model1) + str(
                        update_model2
                    )  #+' Output1: '+str(output1)+' Output2: '+str(output2)
            # dataCollect("Total Result",Total_Result,totalresult,i_episode)
            print(f'\r{totalresult}', end='\r')

        writer.add_scalar('reward/episode', ep_reward, ep)
        writer.add_scalar('Avgreward/episode', avg_reward, ep)
        writer.add_scalar('output1/episode', output1, ep)
        writer.add_scalar('output2/episode', output2, ep)
        writer.add_scalar('input1/episode', input1, ep)
        writer.add_scalar('input2/episode', input2, ep)

        trial.report(avg_reward, ep)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return avg_reward