Example #1
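# These snippets assume the usual module-level imports from their source projects
# (e.g. torch, gym, time, argparse, numpy as np, torch.autograd.Variable) and
# project-specific classes such as Policy, Value, Env, Memory, and ReplayMemory.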
def test(rank, params, shared_p):
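    # Evaluation worker: repeatedly syncs weights from the shared policy, rolls out
    # the deterministic (mean) action, and prints per-episode statistics.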
    torch.manual_seed(params.seed + rank)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = Policy(num_inputs, num_outputs)

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    reward_sum = 0
    done = True

    start_time = time.time()

    episode_length = 0
    while True:
        episode_length += 1
        policy.load_state_dict(shared_p.state_dict())
        mu, _ = policy(state)
        action = mu.data
        env_action = action.squeeze().numpy()
        state, reward, done, _ = env.step(env_action)
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = Variable(torch.Tensor(state).unsqueeze(0))
Example #2
def load_models(args, state_size, n_drones, action_size):
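    # Build one Policy per drone and load its saved A2C weights from disk.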
    nets = []
    for i in range(n_drones):
        model = Policy(state_size,
                       n_drones,
                       action_size,
                       policy_type=args.policy)
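        # NOTE: icm_model_name is assumed to be defined at module scope in the original code.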
        model.load_state_dict(
            torch.load(
                f"A2C_models/{args.policy}_policy/A2C_drone_{icm_model_name}{i}.bin"
            ))
        nets.append(model)
    return nets
Example #3
def train(rank, params, shared_p, shared_v, optimizer_p, optimizer_v):
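    # Training worker: collects rollout segments with local copies of the policy and
    # value networks, then applies KL-penalized policy updates and value regression
    # through the shared models' optimizers.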
    torch.manual_seed(params.seed + rank)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = Policy(num_inputs, num_outputs)
    value = Value(num_inputs)

    memory = ReplayMemory(1e6)
    batch_size = 10000

    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        policy.load_state_dict(shared_p.state_dict())
        value.load_state_dict(shared_v.state_dict())

        w = -1
        while w < batch_size:
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []

            # Roll out up to params.num_steps environment steps
            for step in range(params.num_steps):
                w += 1
                states.append(state)

                mu, sigma_sq = policy(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt()*Variable(eps))
                actions.append(action)

                v = value(state)
                values.append(v)

                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = env.step(env_action)
                done = (done or episode_length >= params.max_episode_length)
                reward = max(min(reward, 1), -1)
                rewards.append(reward)

                if done:
                    episode_length = 0
                    state = env.reset()

                state = Variable(torch.Tensor(state).unsqueeze(0))

                if done:
                    break

            # Bootstrap the return with the value of the last state if the rollout
            # was truncated rather than terminated.
            R = torch.zeros(1, 1)
            if not done:
                v = value(state)
                R = v.data

            # compute returns and advantages:
            values.append(Variable(R))
            R = Variable(R)
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                returns.insert(0, R)
                A = R - values[i]
                advantages.insert(0, A)

            # store useful info:
            memory.push([states, actions, returns, advantages])

        batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(batch_size)

        # policy grad updates:
        mu_old, sigma_sq_old = policy(batch_states)
        probs_old = normal(batch_actions, mu_old, sigma_sq_old)
        policy_new = Policy(num_inputs, num_outputs)
        kl = 0.
        kl_coef = 1.
        kl_target = Variable(torch.Tensor([params.kl_target]))
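        # Up to 100 surrogate policy updates per batch, stopping early once the KL
        # divergence from the old policy exceeds 4x the target.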
        for m in range(100):
            policy_new.load_state_dict(shared_p.state_dict())
            mu_new, sigma_sq_new = policy_new(batch_states)
            probs_new = normal(batch_actions, mu_new, sigma_sq_new)
            policy_loss = torch.mean(batch_advantages * torch.sum(probs_new/probs_old,1))
            kl = torch.mean(probs_old * torch.log(probs_old/probs_new))
            kl_loss = kl_coef * kl + \
                params.ksi * torch.clamp(kl-2*kl_target, max=0)**2
            total_policy_loss = - policy_loss + kl_loss
            if kl > 4*kl_target:
                break
            # asynchronous update:
            optimizer_p.zero_grad()
            total_policy_loss.backward()
            ensure_shared_grads(policy_new, shared_p)
            optimizer_p.step()

        # value grad updates:
        for b in range(100):
            value.load_state_dict(shared_v.state_dict())
            v = value(batch_states)
            value_loss = torch.mean((batch_returns - v)**2)
            # asynchronous update:
            optimizer_v.zero_grad()
            value_loss.backward()
            ensure_shared_grads(value, shared_v)
            optimizer_v.step()

        # Adapt the KL penalty coefficient toward the target KL range.
        if kl > params.beta_hight*kl_target:
            kl_coef *= params.alpha
        if kl < params.beta_low*kl_target:
            kl_coef /= params.alpha

        print("update done !")
Example #4
def main():
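    # Load a trained policy checkpoint and let it act in the environment for a fixed
    # number of frames (or until the episode ends).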
    # Runtime settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--act-every',
                        type=int,
                        default=3,
                        help='act every N frames (default: 3)')
    parser.add_argument('--duration',
                        type=int,
                        default=1800,
                        help='duration in frames (default: 1800 = 30 seconds)')
    parser.add_argument('--dummy',
                        action='store_true',
                        default=False,
                        help='Dummy env')
    parser.add_argument('--human',
                        action='store_true',
                        default=False,
                        help='P2 is human')
    parser.add_argument('--load-model',
                        type=str,
                        default='results/latest/model.pth')
    args = parser.parse_args()

    args.max_rollout_length = args.duration

    if args.human:
        options["player2"] = "human"

    env = Env(args, 'cpu', options=options, dummy=args.dummy)
    observation_dim = env.observation_dim
    action_dim = env.action_dim

    print(action_dim)

    actor = Policy(action_dim)
    actor_state_dict = torch.load(args.load_model, map_location='cpu')
    actor.load_state_dict(actor_state_dict, strict=False)

    obs = env.reset()
    t = 0
    r = 0
    # actions = [0,0]
    actions = 0
    with torch.no_grad():
        while True:
            try:
                action = actor.act(obs)
                action = action[0].cpu().numpy()
                obs, reward, done = env.step(action)
                if done:
                    env.close()
                    break

                if t >= args.duration:
                    env.close()
                    break

            except Exception as e:
                print(e)
                env.close()
                raise
            t += 1
Example #5
class Actor(object):
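    """Rollout worker: steps the environment with a local copy of the shared policy
    and pushes collected trajectory batches onto the rollout queue."""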
    def __init__(self,
                 args,
                 rollout_queue,
                 shared_state_dict,
                 actor_name=None,
                 rank=0):
        self.args = args
        self.rollout_queue = rollout_queue
        self.actor_name = actor_name
        self.rank = rank
        # self.device = 'cpu'  # args.device
        self.device = args.device
        self.env = None
        self.policy = None
        self.memory = None
        self.shared_state_dict = shared_state_dict

    def initialize(self):
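        # Build the environment (if not already built), the local policy, and the rollout memory.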
        print('Build Environment for {}'.format(self.actor_name))
        if self.env is None:
            self.env = Env(self.args,
                           self.device,
                           options=self.args.options,
                           dummy=self.args.dummy,
                           rank=self.rank)
        self.policy = Policy(self.env.action_dim).to(self.device)
        self.memory = Memory()

    def performing(self):
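        # Main actor loop: sync weights from the shared model, collect num_steps
        # transitions, and enqueue the resulting batch.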
        torch.manual_seed(self.args.seed + self.rank)
        self.initialize()
        obs = self.env.reset()
        with torch.no_grad():
            while True:
                self.policy.load_state_dict(
                    self.shared_state_dict.state_dict())
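                # Reset the recurrent state and the environment at the start of each rollout;
                # if that fails, fall back to continuing from the last observation.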
                try:
                    self.policy.reset_rnn()
                    obs = self.env.reset()
                except:
                    obs = obs[-1:]
                    print(obs.shape)
                self.memory.observations.append(obs)
                # print(obs.shape)
                for step in range(self.args.num_steps):
                    action, action_log_prob = self.policy(obs)
                    self.memory.actions.append(action)
                    self.memory.actions_log_probs.append(action_log_prob)

                    send_action = action[-1].cpu().numpy()
                    obs, reward, done = self.env.step(send_action)
                    self.memory.observations.append(obs)
                    self.memory.rewards.append(
                        torch.from_numpy(reward.astype(np.float32)))

                    # print("actor", obs.shape, action.shape, action_log_prob.shape, reward.shape)
                action, action_log_prob = self.policy(obs)
                self.memory.actions.append(action[0:-1])
                self.memory.actions_log_probs.append(action_log_prob[0:-1])

                # print(self.rollout_queue.qsize())
                self.rollout_queue.put(self.memory.get_batch())