Exemple #1
0
    def __init__(self, params):
        """Build the environment and the PPO agent from a nested config dict.

        ``params`` must contain the sections ``general_params`` (with
        ``seed``), ``env_params``, ``agent_params`` and ``trainer_params``
        (with ``learning_rate_decay``, ``results_path``, ``model_path`` and
        ``t_max``).  The ``env_params`` and ``agent_params`` sections are
        modified in place: the seed and the sizes read from the environment
        are written into them before they are handed on.
        """
        rng_seed = params['general_params']['seed']
        self.__set_seed(seed=rng_seed)

        # The environment receives the same seed that was passed above.
        env_cfg = params['env_params']
        env_cfg['seed'] = rng_seed
        self.env = UnityEnv(params=env_cfg)

        # The agent's input/output sizes are read from the live environment.
        agent_cfg = params['agent_params']
        agent_cfg['state_size'] = self.env.observation_space.shape[0]
        agent_cfg['action_size'] = self.env.action_space_size
        self.agent = AgentPPO(params=agent_cfg)

        # Mirror the trainer settings onto the instance under the same names.
        trainer_cfg = params['trainer_params']
        for option in ('learning_rate_decay', 'results_path', 'model_path',
                       't_max'):
            setattr(self, option, trainer_cfg[option])

        # data gathering variables
        self.avg_rewards, self.scores = [], []
        self.score = 0

        for banner in ("PPO agent.", "Configuration:"):
            print(banner)
        pprint(params)
        logging.info("Configuration: {}".format(params))
Exemple #2
0
def watch_agent(env_name, agent_ckpt, steps):
    """Run a saved actor-critic policy in a Unity environment and print its score.

    Args:
        env_name: ``'reacher'`` selects the Reacher build and
            ``ReacherActorCritic``; any other value selects the Crawler build
            and ``CrawlerActorCritic``.
        agent_ckpt: Path of the state-dict checkpoint given to ``torch.load``.
        steps: Maximum number of environment steps to simulate.

    For every agent the reward accumulated up to its first ``done`` flag is
    taken as its score; agents that never finish within ``steps`` keep a
    score of 0.  The mean over all agents is printed.
    """
    device = torch.device(DEVICE)

    if env_name == 'reacher':
        env = UnityEnv(env_file='data/Reacher.exe', no_graphics=False)
        policy = ReacherActorCritic(env.state_size, env.action_size).to(device)
    else:
        env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
                       no_graphics=False,
                       mlagents=True)
        policy = CrawlerActorCritic(env.state_size, env.action_size).to(device)

    # Close the environment even if loading the checkpoint or stepping fails.
    try:
        checkpoint = torch.load(agent_ckpt, map_location=DEVICE)
        policy.load_state_dict(checkpoint)

        num_agents = env.num_agents
        running_rewards = np.zeros(num_agents)
        scores = np.zeros(num_agents)
        # Explicit "already finished" mask.  Using ``scores[i] == 0`` as the
        # sentinel would never mark an agent whose episode return is exactly 0
        # as done, and would let a later episode overwrite its score.
        finished = np.zeros(num_agents, dtype=bool)

        state = env.reset(train=False)
        for _ in range(steps):
            action, _, _, _ = policy(torch.from_numpy(state).float().to(device))
            state, r, done = env.step(action.detach().cpu().numpy())
            running_rewards += r

            # Record the score of agents whose first episode just ended.
            newly_done = np.asarray(done, dtype=bool).reshape(-1) & ~finished
            scores[newly_done] = running_rewards[newly_done]
            finished |= newly_done
            if finished.all():
                break
    finally:
        env.close()

    print(f'Average score of {num_agents} agents is: {np.mean(scores):.2f}')
Exemple #3
0
def main(path='model_checkpoints'):
    """Load saved MADDPG weights and evaluate them in the Unity Reacher build.

    ``path`` is accepted but not used: the weight locations come from
    ``Config('maddpg')`` (``critic_path`` and ``actor_path``).
    """
    env = UnityEnv(no_graphics=False,
                   env_file='Environments/Reacher_Linux_20/Reacher.x86_64')

    # Report the problem dimensions exposed by the environment wrapper.
    print('Number of agents:', env.num_agents)
    action_size = env.action_size
    state_size = env.state_size
    summary = 'Size of each action: {}, Size of the state space {}'
    print(summary.format(action_size, state_size))

    config = Config('maddpg')
    multi_agent = MultiAgent(env, state_size, action_size, config)
    multi_agent.load_weights(config.critic_path, config.actor_path)
    multi_agent.evaluate()
Exemple #4
0
def main(algo):
    """Train a ``MultiAgent`` on the Unity Tennis build.

    Args:
        algo: Name passed to ``Config`` to select the hyper-parameter set.

    The environment path is relative to the parent directory, i.e. the script
    is expected to be started from a sub-folder of the project.
    """
    env = UnityEnv(no_graphics=True,
                   env_file='../Environments/Tennis_Linux/Tennis.x86_64')

    # Report the problem dimensions exposed by the environment wrapper.
    print('Number of agents:', env.num_agents)
    action_size = env.action_size
    state_size = env.state_size
    summary = 'Size of each action: {}, Size of the state space {}'
    print(summary.format(action_size, state_size))

    config = Config(algo)
    trainer = MultiAgent(env, state_size, action_size, config)
    # Fill the replay buffer before the training loop starts.
    trainer.seed_replay_buffer()
    trainer.train()
def main(path='model_checkpoints'):
    """Roll out a saved PPO agent in the Unity Reacher build and print its score.

    The agent is stepped for a fixed 4000 environment steps; the reward of
    each step (summed over all agents) is collected and the mean of those
    per-step sums is printed.

    Args:
        path: Kept for interface compatibility.
            NOTE(review): the value is overwritten below with the hard-coded
            checkpoint file, so the argument currently has no effect.
    """
    seed = 1234
    env = UnityEnv(env_file='Environments/Reacher_Linux_20/Reacher.x86_64',
                   no_graphics=False)

    # Make sure the Unity process is shut down even if the rollout fails.
    try:
        # number of agents
        print('Number of agents:', env.num_agents)

        # size of each action / of the state space
        action_size = env.action_size
        state_size = env.state_size
        print('Size of each action: {}, Size of the state space {}'.format(
            action_size, state_size))

        path = 'model_checkpoints/ppo.ckpt'
        agent = PPO(env, action_size, state_size, seed)
        agent.load_weights(path)
        rewards = []

        state = env.reset()
        for _step in range(4000):
            action, _, _, _ = agent.policy(state)
            state, reward, _done = env.step(action.cpu().numpy())
            # Record this step's reward.  The previous code appended
            # np.sum(rewards) -- the sum of the accumulator itself -- which
            # is always 0 and made the reported score meaningless.
            rewards.append(np.sum(reward))
    finally:
        env.close()

    print("The agent achieved an average score of {:.2f}".format(
        np.mean(rewards)))
def main(path='model_checkpoints'):
    """Roll out a saved DDPG agent in the Unity Tennis build and print its score.

    A single agent object controls both players: the per-player states are
    flattened into one vector and the resulting action is reshaped to two
    rows before it is sent to the environment.  The rollout stops after 4000
    steps or as soon as any player reports ``done``.

    Args:
        path: Kept for interface compatibility.
            NOTE(review): the value is overwritten below with a hard-coded
            absolute checkpoint path, so the argument currently has no effect.
    """
    seed = 1234
    env = UnityEnv(env_file='../Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=False)

    # Make sure the Unity process is shut down even if the rollout fails.
    try:
        # number of agents
        print('Number of agents:', env.num_agents)

        # size of each action / of the state space
        action_size = env.action_size
        state_size = env.state_size
        print('Size of each action: {}, Size of the state space {}'.format(
            action_size, state_size))

        config = Config('ddpg')
        path = '/home/shuza/Code/Udacity_multiplayer/DDPG/model_weights/ddpg.ckpt'
        agent = Agent(state_size * 2, action_size * 2, Actor, Critic, config)
        agent.load_weights(path)
        rewards = []

        state = env.reset()
        for _step in range(4000):
            action = agent.evaluate(state.reshape(-1))
            state, reward, done = env.step(action.reshape(2, -1))
            # Record this step's reward.  The previous code appended
            # np.sum(rewards) -- the sum of the accumulator itself -- which
            # is always 0 and made the reported score meaningless.
            rewards.append(np.sum(reward))
            if done.any():
                break
    finally:
        env.close()

    print("The agent achieved an average score of {:.2f}".format(
        np.mean(rewards)))
    def __init__(self, params):
        """Build the environment and the MADDPG agents from a nested config dict.

        ``params`` must contain the sections ``general_params`` (with
        ``seed``), ``env_params``, ``agent_params`` and ``trainer_params``
        (with ``learning_rate_decay``, ``results_path``, ``model_path`` and
        ``t_max``).  The ``env_params`` and ``agent_params`` sections are
        modified in place before they are handed on.
        """
        rng_seed = params['general_params']['seed']
        self.__set_seed(seed=rng_seed)

        # The environment receives the same seed that was passed above.
        env_cfg = params['env_params']
        env_cfg['seed'] = rng_seed
        self.env = UnityEnv(params=env_cfg)

        agent_cfg = params['agent_params']

        # First axis of the observation shape is used as the agent count,
        # second axis as the per-agent state size.
        obs_shape = self.env.observation_space.shape
        self.__num_of_agents = obs_shape[0]
        agent_cfg['num_of_agents'] = self.__num_of_agents
        agent_cfg['state_size'] = obs_shape[1]
        agent_cfg['action_size'] = self.env.action_space_size
        self.agents = Agents(params=agent_cfg)

        # Mirror the trainer settings onto the instance under the same names.
        trainer_cfg = params['trainer_params']
        for option in ('learning_rate_decay', 'results_path', 'model_path',
                       't_max'):
            setattr(self, option, trainer_cfg[option])

        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards, self.scores = [], []
        self.score = 0

        self.sigma = 0.5

        for banner in ("MADDPG agent.", "Configuration:"):
            print(banner)
        pprint(params)
        logging.info("Configuration: {}".format(params))
def watch_rnd_game(steps):
    """Drive the Crawler environment with uniform random actions and print the score.

    Args:
        steps: Maximum number of environment steps; the loop ends earlier
            once every agent reports ``done``.

    Actions are drawn with ``np.random.rand`` (values in ``[0, 1)``), one row
    per agent.  The printed value is the mean of the per-agent reward sums.
    """
    env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
                   no_graphics=False,
                   mlagents=True)

    # Close the Unity process even when a step raises.
    try:
        env.reset(train=False)

        num_agents = env.num_agents
        rewards = np.zeros(num_agents)
        for _ in range(steps):
            action = np.random.rand(num_agents, env.action_size)
            _, r, done = env.step(action)
            rewards += r
            if done.all():
                break
        # Report the real agent count instead of a hard-coded "20".
        print(f'Average score of {num_agents} agents is: '
              f'{np.mean(rewards):.2f}')
    finally:
        env.close()
def main(algo):
    """Train a PPO agent on the Unity Tennis build.

    Args:
        algo: Name passed to ``Config`` to select the hyper-parameter set.

    The agent is placed on the first CUDA device when one is available and
    on the CPU otherwise, then handed to ``train_ppo`` together with the
    checkpoint path.
    """
    seed = 7
    path = 'model_checkpoints/ppo.ckpt'

    # Load the ENV
    env = UnityEnv(env_file='Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=True)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    config = Config(algo)

    # Always bind ``device``: previously it was only assigned inside the
    # ``if torch.cuda.is_available()`` branch, so a CPU-only machine failed
    # with a NameError on the next line.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    agent = PPO(action_size, state_size, seed, device, config)

    train_ppo(env, agent, EPISODES, path)
        clip = random.choice(clips)
        nstep = random.choice(nsteps)
        epoch = random.choice(epochs)
        gae_tau = random.choice(gae_taus)
        weight_decay = random.choice(weight_decays)
        lrate_decay = random.choice(lrate_decays)
        lrate_schedule = lambda it: lrate_decay**it

        summary = f'nbatch_{nbatch:d}_lrate_{lrate:.0E}_clip_{clip:.2f}'
        summary += f'_nstep_{nstep:d}_epoch_{epoch:d}_gae_{gae_tau:.2f}'
        summary += f'_lrdecay_{lrate_decay}_wdcay_{weight_decay}'

        writer = SummaryWriter(os.path.join(root_logdir, summary))

        # create new environment
        env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
                       mlagents=True)

        # create new policy
        policy = CrawlerActorCritic(env.state_size, env.action_size).to(device)

        # create agent
        a = Agent(env,
                  policy,
                  nsteps=nstep,
                  gamma=gamma,
                  epochs=epoch,
                  nbatchs=nbatch,
                  ratio_clip=clip,
                  lrate=lrate,
                  gradient_clip=gradient_clip,
                  beta=beta,
Exemple #11
0
    episodes = 2000  # total number of episodes to run
    steps = 2000  # maximum number of steps per episode
    upd_every = 1  # update agents every # of steps
    batch_size = 128

    expl_theta = 0.15
    expl_sigma = 0.2

    lrate_actor = 1e-3
    lrate_critic = 1e-3

    tau = 0.02

    # environment
    env = UnityEnv()

    agent = Agent(env.state_size * 2,
                  env.action_size,
                  Actor,
                  Critic,
                  exploration_sigma=expl_sigma,
                  exploration_theta=expl_theta,
                  lrate_actor=lrate_actor,
                  lrate_critic=lrate_critic,
                  update_every=upd_every,
                  batch_size=batch_size,
                  tau=tau)

    # logging
    scores = deque(maxlen=100)
Exemple #12
0
            writer.add_scalar('data/score', mean, ep_i)
            if mean > 0.50 and mean > last_saved:
                summary += " (saved)"
                last_saved = mean
                agent.save('saved_models/tennis_ddpg.ckpt')

        print(summary)


if __name__ == '__main__':
    # hyperparameters
    episodes = 2000
    steps = 2000

    # environment
    env = UnityEnv(no_graphics=False)
    state_size = env.state_size * 2
    action_size = env.action_size * 2

    # agent
    agent = Agent(state_size,
                  action_size,
                  Actor,
                  Critic,
                  lrate_critic=1e-3,
                  lrate_actor=1e-4,
                  tau=0.01,
                  buffer_size=1e6,
                  batch_size=256,
                  gamma=0.99,
                  exploration_mu=0.0,
Exemple #13
0
class Trainer:
    """Train and evaluate an epsilon-greedy agent in a Unity environment.

    The trainer is configured from one nested dict with the sections
    ``general_params``, ``env_params``, ``agent_params`` and
    ``trainer_params``.  ``train`` runs the episode loop, decays epsilon and
    the learning rate, and dumps reward statistics; ``test`` replays a saved
    checkpoint.
    """

    def __init__(self, params):
        """Create the environment and the agent and read the trainer settings.

        Args:
            params: Nested configuration dict.  ``env_params`` and
                ``agent_params`` are modified in place (seed, state size and
                action size are written into them).
        """
        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        # The network dimensions are read from the live environment.
        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space.n
        self.agent = Agent(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.max_eps = trainer_params['max_eps']
        self.final_eps = trainer_params['final_eps']
        # NOTE: train() recomputes eps_decay from max_eps/final_eps.
        self.eps_decay = trainer_params['eps_decay']
        self.b_decay = trainer_params['b_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        """Run the training loop and dump the collected reward statistics.

        Episodes are numbered ``1 .. num_of_episodes - 1``.  After the loop
        two pickled arrays are written below ``self.results_path``: the
        per-step reward matrix (one row per episode index) and the list of
        100-episode average rewards.

        Args:
            num_of_episodes: Upper bound (exclusive) of the episode counter.
        """
        reward_window = deque(maxlen=100)

        # Multiplicative decay chosen so that epsilon shrinks by the factor
        # final_eps / max_eps over the first 20% of the episodes.
        self.eps_decay = (self.final_eps /
                          self.max_eps)**(1 / (0.2 * num_of_episodes))

        # Per-step rewards; starts with 300 columns and is widened on demand.
        reward_matrix = np.zeros((num_of_episodes, 300))

        for episode_i in range(1, num_of_episodes):

            state = self.env.reset()
            done = False
            total_reward = 0
            total_loss = 0

            self.agent.eps *= self.eps_decay

            counter = 0
            while not done:
                action = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state

                total_loss += self.agent.agent_loss
                total_reward += reward

                # Fix: an episode longer than the current column count used
                # to raise IndexError.  Double the width (zero padded) so no
                # reward is lost; shorter episodes are unaffected.
                if counter >= reward_matrix.shape[1]:
                    reward_matrix = np.pad(
                        reward_matrix,
                        ((0, 0), (0, reward_matrix.shape[1])),
                        mode='constant')
                reward_matrix[episode_i, counter] = reward
                counter += 1

            reward_window.append(total_reward)

            # Same line goes to the console (overwriting the previous one via
            # the leading '\r') and to the log file.
            progress = (
                'Episode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f} '
                '\t\tTotal loss: {:.2f}\tEpsilon: {:.2f}\tBeta: {:.2f}\tLearning rate: {:.4f}'
                .format(episode_i, total_reward, np.mean(reward_window),
                        total_loss, self.agent.eps, self.agent.b,
                        self.agent.learning_rate))
            print('\r' + progress, end="")
            logging.info(progress)

            self.agent.learning_rate *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))
                self.avg_rewards.append(avg_reward)

                # Save a checkpoint whenever the 100-episode average reaches
                # the "solved" threshold.
                if avg_reward >= 13.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    torch.save(
                        self.agent.get_qlocal().state_dict(), self.model_path +
                        'checkpoint_{}.pth'.format(datetime.datetime.today().
                                                   strftime('%Y-%m-%d_%H-%M')))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        reward_matrix.dump(self.results_path +
                           'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self, checkpoint_filename, time_span=10):
        """Replay a saved checkpoint for ``time_span`` episodes.

        Args:
            checkpoint_filename: File name appended to ``self.model_path``.
            time_span: Number of episodes to play.

        The environment is closed after the last episode.
        """
        checkpoint_path = self.model_path + checkpoint_filename
        self.agent.get_qlocal().load_state_dict(torch.load(checkpoint_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            done = False

            while not done:
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += reward

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        """Seed torch (CPU and all CUDA devices), ``random`` and NumPy."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
Exemple #14
0
def make_env(file_name, wrapped=False):
    """Create the environment for the given Unity build.

    Args:
        file_name: Path of the environment build.
        wrapped: When true, return the project's ``UnityEnv`` wrapper;
            otherwise return a plain ``UnityEnvironment``.
    """
    if not wrapped:
        return UnityEnvironment(file_name=file_name)
    return UnityEnv(environment_filename=file_name)
Exemple #15
0
class Trainer:
    """Train and evaluate a PPO agent in a Unity environment.

    The trainer is configured from one nested dict with the sections
    ``general_params``, ``env_params``, ``agent_params`` and
    ``trainer_params``.  ``train`` runs fixed-length episodes and saves actor
    and critic checkpoints once the average reward threshold is reached;
    ``test`` replays saved checkpoints.
    """

    def __init__(self, params):
        """Create the environment and the agent and read the trainer settings.

        ``params['env_params']`` and ``params['agent_params']`` are modified
        in place: the seed, state size and action size are written into them.
        """

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        # The network dimensions are read from the live environment.
        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space_size
        self.agent = AgentPPO(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        # Number of environment steps per training episode (see train()).
        self.t_max = trainer_params['t_max']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("PPO agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        """Run the training loop and dump the per-episode mean rewards.

        Episodes are numbered ``1 .. num_of_episodes - 1`` and each one runs
        for exactly ``self.t_max`` steps (the ``done`` flag is forwarded to
        the agent but does not end the episode here).  After the loop
        ``self.avg_rewards`` is pickled below ``self.results_path``.
        """

        logging.info("Training:")
        # Holds the total rewards of the most recent 100 episodes.
        reward_window = deque(maxlen=100)
        # reward_matrix = np.zeros((num_of_episodes, 300))

        for episode_i in range(1, num_of_episodes):

            state = self.env.reset()
            total_reward = 0
            total_loss = 0

            counter = 0
            total_action_mean = 0
            total_action_std = 0

            for t in range(self.t_max):
                action, log_probs, mean, std = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done,
                                log_probs)
                state = next_state

                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))

                total_loss += self.agent.agent_loss
                total_reward += np.array(reward)

                # Incremental (running) average of the policy's mean and std
                # over the steps of this episode.
                counter += 1
                total_action_mean = total_action_mean * (
                    counter - 1) / counter + np.mean(mean) / counter
                total_action_std = total_action_std * (
                    counter - 1) / counter + np.mean(std) / counter

            reward_window.append(total_reward)
            self.avg_rewards.append(np.mean(total_reward))
            # '\r' with end="" rewrites the same console line every episode.
            print(
                '\rEpisode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f}\tMean: {:.2f} \tStd {:.2f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.mean(total_reward),
                        np.mean(reward_window), total_action_mean,
                        total_action_std, total_loss,
                        self.agent.learning_rate_policy,
                        self.agent.learning_rate_value_fn),
                end="")

            # logging.info('Episode {}\tCurrent Score (average over 20 robots): {:.2f}\tAverage Score (over episodes): {:.2f} '
            #              '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'.
            #              format(episode_i, np.mean(total_reward), np.mean(reward_window),
            #                     total_loss, self.agent.learning_rate_policy, self.agent.learning_rate_value_fn))

            # Decay both learning rates once per episode and push them to the
            # agent.
            self.agent.learning_rate_policy *= self.learning_rate_decay
            self.agent.learning_rate_value_fn *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate_policy,
                                         self.agent.learning_rate_value_fn)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                # Save actor and critic whenever the windowed average reaches
                # the threshold.
                if avg_reward >= 30.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)
                    # NOTE(review): the timestamp is evaluated separately for
                    # the actor and the critic file name.
                    torch.save(
                        self.agent.get_actor().state_dict(),
                        self.model_path + 'checkpoint_actor_{}.pth'.format(
                            datetime.datetime.today().strftime(
                                '%Y-%m-%d_%H-%M')))
                    torch.save(
                        self.agent.get_critic().state_dict(),
                        self.model_path + 'checkpoint_critic_{}.pth'.format(
                            datetime.datetime.today().strftime(
                                '%Y-%m-%d_%H-%M')))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self,
             checkpoint_actor_filename,
             checkpoint_critic_filename,
             time_span=10):
        """Replay saved actor/critic checkpoints for ``time_span`` episodes.

        Both file names are appended to ``self.model_path``.  An episode ends
        as soon as any entry of ``done`` is true.  The environment is closed
        after the last episode.
        """
        checkpoint_actor_path = self.model_path + checkpoint_actor_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agent.get_actor().load_state_dict(
            torch.load(checkpoint_actor_path))
        self.agent.get_critic().load_state_dict(
            torch.load(checkpoint_critic_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            #done = False

            while True:
                # NOTE(review): train() unpacks four values from
                # choose_action(); here the result is passed straight to
                # env.step() -- confirm that 'test' mode returns only the
                # action.
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += np.array(reward)
                if any(done):
                    break

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        """Seed torch (CPU and all CUDA devices), ``random`` and NumPy."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
    gamma = 0.99
    timesteps = 100
    ratio_clip = 0.2
    batch_size = int(32 * 20)
    epochs = 10
    gradient_clip = 10.0
    lrate = 1e-4
    log_each = 10
    beta = 0.01
    gae_tau = 0.95
    decay_steps = None
    solved = 30.0
    out_file = 'saved_models/ppo.ckpt'

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    env = UnityEnv(env_file='data/Reacher/Reacher.exe')
    policy = ReacherActorCritic(env.state_size, env.action_size).to(device)
    a = Agent(env,
              policy,
              timesteps=timesteps,
              gamma=gamma,
              epochs=epochs,
              batch_size=batch_size,
              ratio_clip=ratio_clip,
              lrate=lrate,
              gradient_clip=gradient_clip,
              beta=beta,
              gae_tau=gae_tau)

    train(a,
          iterations=iterations,
class Trainer:
    """Train and evaluate a group of MADDPG agents in a Unity environment.

    The trainer is configured from one nested dict with the sections
    ``general_params``, ``env_params``, ``agent_params`` and
    ``trainer_params``.  ``train`` runs the episode loop and saves two actor
    checkpoints plus one critic checkpoint once the average score threshold
    is reached; ``test`` replays saved checkpoints.
    """

    def __init__(self, params):
        """Create the environment and the agents and read the trainer settings.

        ``params['env_params']`` and ``params['agent_params']`` are modified
        in place: the seed, agent count, state size and action size are
        written into them.
        """

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']

        # First axis of the observation shape is used as the agent count,
        # second axis as the per-agent state size.
        self.__num_of_agents = self.env.observation_space.shape[0]
        state_size = self.env.observation_space.shape[1]
        action_size = self.env.action_space_size
        agent_params['num_of_agents'] = self.__num_of_agents
        agent_params['state_size'] = state_size
        agent_params['action_size'] = action_size
        self.agents = Agents(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        # Upper bound on the number of steps per training episode.
        self.t_max = trainer_params['t_max']

        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        # Value handed to self.agents.reset() each episode; multiplied by
        # 0.99 per episode in train().
        self.sigma = 0.5

        print("MADDPG agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        """Run the training loop and dump the running average scores.

        Episodes are numbered ``1 .. num_of_episodes - 1``.  Each episode
        runs for at most ``self.t_max`` steps and ends early when any agent
        reports ``done``.  The episode score is the maximum over the agents'
        reward sums.  After the loop ``self.avg_rewards`` is pickled below
        ``self.results_path``.
        """

        logging.info("Training:")
        # Holds the scores of the most recent 100 episodes.
        reward_window = deque(maxlen=100)

        for episode_i in range(1, num_of_episodes):

            states = self.env.reset()
            self.agents.reset(self.sigma)
            # One accumulated reward per agent.
            scores = np.zeros(self.env.observation_space.shape[0])
            total_loss = 0

            self.sigma *= 0.99

            counter = 0
            for t in range(self.t_max):

                actions = self.agents.choose_action(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                self.agents.step(states, actions, rewards, next_states, dones)
                states = next_states

                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))

                total_loss += self.agents.agent_loss
                scores += rewards
                counter += 1
                if any(dones):
                    break

            reward_window.append(np.max(scores))
            self.avg_rewards.append(np.mean(np.array(reward_window)))
            # '\r' with end="" rewrites the same console line every episode.
            print(
                '\rEpisode {}\tCurrent Score: {:.4f}\tAverage Score: {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic),
                end="")

            logging.info(
                'Episode {}\tCurrent Score: {:.4f}\tAverage Score (over episodes): {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actors): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic))

            # Decay both learning rates once per episode and push them to the
            # agents.
            self.agents.learning_rate_actor *= self.learning_rate_decay
            self.agents.learning_rate_critic *= self.learning_rate_decay
            self.agents.set_learning_rate(self.agents.learning_rate_actor,
                                          self.agents.learning_rate_critic)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                # Save both actors, the critic and the reward history
                # whenever the windowed average reaches the threshold.
                if avg_reward >= 1.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)

                    # One timestamp shared by all files of this checkpoint.
                    t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
                    torch.save(
                        self.agents.get_actor()[0].state_dict(),
                        self.model_path + 'checkpoint_actor1_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_actor()[1].state_dict(),
                        self.model_path + 'checkpoint_actor2_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_critic().state_dict(),
                        self.model_path + 'checkpoint_critic_{}.pth'.format(t))
                    np.array(self.avg_rewards).dump(
                        self.results_path + 'average_rewards_{}.dat'.format(t))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_{}.dat'.format(t))

    def test(self,
             checkpoint_actor1_filename,
             checkpoint_actor2_filename,
             checkpoint_critic_filename,
             time_span=10):
        """Replay saved actor/critic checkpoints for ``time_span`` episodes.

        All three file names are appended to ``self.model_path``.  An episode
        ends as soon as any entry of ``done`` is true; the score adds the
        maximum reward of each step.  The environment is closed after the
        last episode.
        """
        checkpoint_actor1_path = self.model_path + checkpoint_actor1_filename
        checkpoint_actor2_path = self.model_path + checkpoint_actor2_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agents.get_actor()[0].load_state_dict(
            torch.load(checkpoint_actor1_path))
        self.agents.get_actor()[1].load_state_dict(
            torch.load(checkpoint_actor2_path))
        self.agents.get_critic().load_state_dict(
            torch.load(checkpoint_critic_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            #done = False

            while True:
                action = self.agents.choose_action(state, 'test')
                state, reward, done, _ = self.env.step(action)
                # Per-step maximum over the agents' rewards (train() instead
                # takes the maximum of the per-agent sums).
                self.score += np.array(np.max(reward))
                if any(done):
                    break

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        """Seed torch (CPU and all CUDA devices), ``random`` and NumPy."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
            next_state, reward, done = env.step(action.reshape(2, -1))
            score += reward
            state = next_state
            if done.any():
                break
        scores.append(np.max(score))
    return scores


if __name__ == '__main__':
    # Command line: optionally choose which agent checkpoint to replay.
    parser = argparse.ArgumentParser()
    parser.add_argument('--agent',
                        '-a',
                        default='saved_models/tennis_ddpg.ckpt')
    args = parser.parse_args()

    # create environment
    env = UnityEnv(no_graphics=False)

    # Shut the Unity process down even if restoring or watching the agent
    # raises; previously env.close() was skipped on any error.
    try:
        # A single agent sees the concatenated state/action of both players.
        state_size = env.state_size * 2
        action_size = env.action_size * 2

        # restore agent checkpoint
        agent = Agent(state_size, action_size, Actor, Critic,
                      restore=args.agent)

        # watch agent
        scores = watch(env, agent, 10)

        print(f'Average score over 10 episodes: {np.mean(scores):.2f}')
    finally:
        env.close()
#           'worker_id' : 0,
#           'seed' : np.random.randint(1000),
#           'visual_mode' : False,
#           'multiagent_mode' : True}
# env_name = 'Reacher'
# env = UnityEnv(env_params)
#
# Settings for the single-agent Reacher build; a fresh random seed is drawn
# on every run.
env_params = {
    'path': '../Reacher_Linux/Reacher.x86_64',
    'worker_id': 0,
    'seed': np.random.randint(1000),
    'visual_mode': False,
    'multiagent_mode': False
}
env_name = 'Reacher'
env = UnityEnv(env_params)

# env_name = 'MountainCarContinuous-v0'
# env = gym.make(env_name) #Pendulum-v0 #MountainCarContinuous-v0 #LunarLanderContinuous-v2

# Environments whose reset() does not accept ``train_mode`` (e.g. the gym
# alternatives above) raise TypeError for the unexpected keyword; fall back
# to the plain call only in that case instead of swallowing every exception
# (including KeyboardInterrupt) with a bare ``except``.
try:
    observation = env.reset(train_mode=False)
except TypeError:
    observation = env.reset()
action_space = env.action_space
observation_space = env.observation_space

# Hyper-parameters derived from the environment's action/observation bounds.
params = dict()
params['action_dim'] = len(env.action_space.low)
params['state_dim'] = len(observation_space.low)
params['num_episodes'] = 200
params['buffer_size'] = int(1e6)  # replay buffer size