Example #2
import numpy as np

from agent import ACAgent   # assumed import; ACAgent is defined in the project's agent module


class Train:
    #best_model_path = "./best_model.checkpoint"
    state_size = 0
    i_episode = 0

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

        print('Number of agents:', self.agent_size)
        print('Size of each action:', self.action_size)

        self.num_episodes = 300
        self.rollout_length = 30
        self.agent = ACAgent(self.state_size,
                             self.action_size,
                             self.agent_size,
                             rollout_length=self.rollout_length,
                             lr=1e-3,
                             lr_decay=.95,
                             gamma=.95,
                             value_loss_weight=1,
                             gradient_clip=5,
                             )
        self.total_rewards = []
        self.avg_scores = []
        self.max_avg_score = -1
        self.max_score = -1
        self.worsen_tolerance = 10  # early-stopping budget: number of consecutive episodes the average score may worsen

        self.model_path = f"./data/brain{self.brain_id}.checkpoint"
        self.data_path = f"./data/data{self.brain_id}"
        
        #self.agent.load(self.model_path)
        #self.agent.load(self.best_model_path)
        

    def get_action(self, states):
        actions, log_probs, state_values = self.agent.sample_action(states)      # sample actions for all parallel environments
        return actions, log_probs, state_values

    def before_episodes(self):
        self.experience = []
        self.scores = np.zeros(self.agent_size)                                           # initialize the score
        self.steps_taken = 0

    def train_step(self, states, rewards, not_dones, log_probs, state_values, actions):
        self.steps_taken += 1

        self.experience.append([actions, rewards, log_probs, not_dones, state_values])

        if self.steps_taken % self.rollout_length == 0:
            self.agent.update_model(self.experience)
            del self.experience[:]

        self.scores += rewards                                                   # update the scores

    def after_episode(self):

        episode_score = np.mean(self.scores)                                         # mean score across all agents for this episode

        self.total_rewards.append(episode_score)
        print("Episodic {} Score: {}".format(self.i_episode, episode_score))
        # self.writedata("Episodic {} Score: {}".format(self.i_episode, episode_score))
        self.writedata(f"{episode_score}")
        if self.max_score < episode_score:                                           # saving new best model
            self.max_score = episode_score
            self.agent.save(self.model_path)

        if len(self.total_rewards) >= 100:                       # track the average score over the latest 100 episodes
            latest_avg_score = sum(self.total_rewards[-100:]) / 100
            print("100-Episode Average Score: {}".format(latest_avg_score))
            # self.writedata("100 Episodic Average Score: {}".format(latest_avg_score))
            self.avg_scores.append(latest_avg_score)

            if self.max_avg_score <= latest_avg_score:           # new best (or equal) average score
                self.worsen_tolerance = 10                       # reset the tolerance counter
                self.max_avg_score = latest_avg_score

            else:
                self.worsen_tolerance -= 1                       # use up one unit of tolerance
                if self.max_avg_score > 10:                      # roll back to the last best model
                    print("Loaded from last best model.")
                    # self.writedata("Loaded from last best model.")
                    self.agent.load(self.model_path)

        self.i_episode += 1

    def writedata(self, string):
        with open(self.data_path, 'a') as f:
            f.write(string + "\n")
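
A possible way to drive the Train class above, episode by episode; this loop is not part of the original example. It assumes the Unity ML-Agents environment API seen in Examples #3 and #6 (env.reset(train_mode=True)[brain_name], env_info.vector_observations, env_info.rewards, env_info.local_done) and that env, brain_name, state_size, action_size and num_agents have already been set up as in Example #3:

import numpy as np

trainer = Train(state_size=state_size,
                action_size=action_size,
                agent_size=num_agents,
                brain_id=0)

for episode in range(trainer.num_episodes):
    env_info = env.reset(train_mode=True)[brain_name]   # reset the environment
    states = env_info.vector_observations                # initial states
    trainer.before_episodes()
    dones = [False] * num_agents
    while not np.any(dones):
        actions, log_probs, state_values = trainer.get_action(states)
        # actions are assumed to be torch tensors, as in Example #6
        env_info = env.step(actions.detach().cpu().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = np.array(env_info.rewards)
        dones = np.array(env_info.local_done)
        trainer.train_step(next_states, rewards, 1.0 - dones,
                           log_probs, state_values, actions)
        states = next_states
    trainer.after_episode()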
                  
Example #3
File: train.py  Project: qiaochen/A2C
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space 
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    num_episodes = 300
    rollout_length = 5
    agent = ACAgent(state_size, 
                    action_size,
                    num_agents,
                    rollout_length=rollout_length,
                    lr=1e-4,
                    lr_decay=.95,
                    gamma=.95,
                    value_loss_weight=1,
                    gradient_clip=5,
                    )
    total_rewards = []
    avg_scores = []
    max_avg_score = -1
    max_score = -1
    worsen_tolerance = 10  # early-stopping budget: number of consecutive episodes the average score may worsen
    rollout = []
    for i_episode in range(1, num_episodes+1):
        env_inst = env.reset(train_mode=True)[brain_name]                       # reset the environment
        states = env_inst.vector_observations                                   # get the current state
        scores = np.zeros(num_agents)                                           # initialize the score
        dones = [False]*num_agents
Example #4
from agent import ACAgent, ACArgs
from trainer import DistributedTrainer
from numeric_env import MultiEnv

import torch
torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = ACArgs(state_dim=env.STATE_DIM, action_dim=2)

# decentralised setup: one independent ACAgent per agent in the environment
agents = [ACAgent(args) for _ in range(2)]
trainer = DistributedTrainer(agents,
                             env,
                             parameter_share=False,
                             log_dir='../logs/ac_d')
trainer.train(1000000)
Example #5
from agent import ACAgent, ACArgs
from trainer import CentralizedTrainer
from numeric_env import MultiEnv

import torch
torch.set_num_threads(1)

env = MultiEnv(2, 2)
args = ACArgs(state_dim=env.STATE_DIM, action_dim=4)

# centralised setup: a single ACAgent acts for all agents through the CentralizedTrainer
agent = ACAgent(args)
trainer = CentralizedTrainer(agent, env, log_dir='../logs/ac_c')
trainer.train(1000000)
Example #6
File: test.py  Project: qiaochen/A2C
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # dim of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # dim of the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = ACAgent(state_size, action_size, num_agents)

    agent.load(best_model_path)

    test_scores = []
    for i_episode in tqdm(range(1, 101)):
        scores = np.zeros(num_agents)  # initialize the scores
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current states
        dones = [False] * num_agents
        while not np.any(dones):
            actions = agent.act(states)  # select actions
            actions = actions.detach().cpu().numpy()
            env_info = env.step(actions)[brain_name]  # send the actions to the environment