Example no. 1
    def __init__(self, config, dev):

        self.dev = dev
        self.num_env = config['num_envs']
        self.get_img_from_render = config['get_img_from_render']

        self.obs_shape = (self.num_env,) + config['obs_space'][1:]
        self.reward_shape = (self.num_env,) + config['reward_space'][1:]
        self.gamma_shape = (self.num_env,) + config['gamma_space'][1:]

        if self.num_env == 1:
            self.env = gym.make(config['game_name'])
        else:
            def make_env():
                def _thunk():
                    env = gym.make(config['game_name'])
                    return env
                return _thunk

            envs = [make_env() for i in range(self.num_env)]
            self.env = SubprocVecEnv(envs)
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, num_recurrence_steps, state_shape,
                 action_shape, stats):
        ''' -one agent is assigned to a collector.
            -a collector runs a bunch of envs in parallel to feed to that agent
            -you could run a bunch of collectors simultaneously,
                |-  and then use weight mixing on the agents separately
        '''
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.num_recurrence_steps = num_recurrence_steps
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1
        self.states = torch.zeros(
            (batch_size, self.buffer_width + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()
        self.hidden_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
        self.cell_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
Example no. 3
def gen_multi_envs(n_envs, policy):
    def make_env():
        def _thunk():
            env = gen_env(policy)
            return env

        return _thunk

    envs = [make_env() for i in range(n_envs)]
    envs = SubprocVecEnv(envs)
    return envs
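# Usage sketch (not part of the original example; `policy` is an assumption here):
# SubprocVecEnv builds each env inside its own worker process, which is why the
# factories above return zero-argument thunks. Stepping the vectorized env then
# looks like stepping a single gym env, but batched across workers:
import numpy as np

envs = gen_multi_envs(n_envs=4, policy=policy)
states = envs.reset()                             # stacked observations, shape (4, *obs_shape)
actions = np.array([envs.action_space.sample() for _ in range(4)])
next_states, rewards, dones, infos = envs.step(actions)   # one synchronous step per worker
envs.close()                                      # terminate the worker processes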
def main():

    pixels = (
        (0.0, 1.0, 1.0),
        (0.0, 1.0, 0.0),
        (0.0, 0.0, 1.0),
        (1.0, 1.0, 1.0),
        (1.0, 1.0, 0.0),
        (0.0, 0.0, 0.0),
        (1.0, 0.0, 0.0),
    )
    pixel_to_categorical = {pix: i for i, pix in enumerate(pixels)}
    num_pixels = len(pixels)

    #For each mode in MiniPacman there are different rewards
    mode_rewards = {
        "regular": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "avoid": [0.1, -0.1, -5, -10, -20],
        "hunt": [0, 1, 10, -20],
        "ambush": [0, -0.1, 10, -20],
        "rush": [0, -0.1, 9.9]
    }
    reward_to_categorical = {
        mode: {reward: i
               for i, reward in enumerate(mode_rewards[mode])}
        for mode in mode_rewards.keys()
    }

    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())
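# Sketch (not in the original excerpt): with nn.CrossEntropyLoss the env model is
# trained against per-pixel class ids, so frames have to be mapped through
# pixel_to_categorical. A minimal version of that conversion, assuming `frame` is a
# float numpy array of shape (3, H, W) whose RGB triples all occur in `pixels`:
import numpy as np
import torch

def frame_to_targets(frame, pixel_to_categorical):
    flat = frame.reshape(3, -1).T                        # (H*W, 3) rows of RGB values
    ids = [pixel_to_categorical[tuple(p)] for p in flat]
    return torch.LongTensor(ids)                         # one class id per pixel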
Example no. 5
File: ppo.py Project: CAiM-lab/PPO
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        #  Lists for Tensorboard to visualize learning process during learning
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
Example no. 6
def main():
    num_envs = 16
    env_name = "CartPole-v0"

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    STATE_SIZE = env.observation_space.shape[0]
    N_ACTIONS = env.action_space.n

    agent = Agent(STATE_SIZE, N_ACTIONS)

    trainer = Trainer(envs, agent, lr=3e-4)
    trainer.train(epochs=10000, max_steps=5, test_every=50)
def main():
    num_envs = 16
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    # Hyper params:
    hidden_size = 256
    lr = 3e-4
    num_steps = 5

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

    optimizer = optim.Adam(model.parameters())

    max_frames = 20000
    frame_idx = 0
    test_rewards = []
    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # Each worker environment runs num_steps steps, implementing n-step sampling
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            # Record the per-environment values for these num_steps steps
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % 100 == 0:
                test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
                plot(frame_idx, test_rewards)

        # Pass the collected worker data to the main network and update its parameters
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        # Concatenate the values from all num_steps steps
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values
        # Compute the mean losses
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
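# compute_returns is referenced above but not included in this excerpt; a minimal
# sketch consistent with how it is called (plain n-step discounted returns
# bootstrapped from next_value, with masks zeroing values across episode ends;
# the gamma default is an assumption):
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns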
Example no. 8
def main():

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape,
                                envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1,
                                  state_shape,
                                  num_actions,
                                  num_rewards,
                                  env_model,
                                  distil_policy,
                                  full_rollout=full_rollout)

    actor_critic = I2A(state_shape,
                       num_actions,
                       num_rewards,
                       256,
                       imagination,
                       full_rollout=full_rollout)
    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    env_model     = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic  = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_loss = 0.01 * (F.softmax(logit).detach() *
                              F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()  # step the distillation optimizer, not the I2A one

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
Example no. 9
            actor_loss  = - torch.min(surr1, surr2).mean()
            critic_loss = (return_ - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
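# surr1 and surr2 are not defined in this excerpt; they are the unclipped and
# clipped PPO surrogate terms (see the ppo_update method in Example no. 10 below).
# A toy numeric illustration with invented values of how the clipping caps the
# incentive once the probability ratio drifts too far from 1:
import torch

ratio = torch.tensor([0.5, 1.0, 1.5])
advantage = torch.tensor([1.0, 1.0, 1.0])
clip_param = 0.2
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
print(torch.min(surr1, surr2))   # tensor([0.5000, 1.0000, 1.2000])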


if __name__ == '__main__':
	logger = Logger('./log')
	env = ProstheticsEnv(False)
	envs = [make_env() for i in range(NUM_ENVS)]
	envs = SubprocVecEnv(envs)
	model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
	optimizer = optim.Adam(model.parameters(), lr=lr)

	frame_idx = 0
	test_rewards = []

	state = envs.reset()

	while frame_idx < max_frames:

		log_probs = []
		values    = []
		states    = []
		actions   = []
		rewards   = []
Example no. 10
File: ppo.py Project: CAiM-lab/PPO
class PPO(object):
    """Main PPO class"""
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        #  Lists for Tensorboard to visualize learning process during learning
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)

        #self.delta = (self.args.lr-self.args.lr_end)/1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure action is loaded to CPU (not GPU)
                next_state, reward, done, _ = self.envs.step(
                    action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))

                states.append(state)
                actions.append(action)
                state = next_state
                frame_idx += 1
                #self.scheduler()

                # Evaluate training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)

                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(
                                np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(
                                np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss', np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)

                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() to take it away from the graph i.e. this operations are ignored for gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private thunk used to build an env factory for SubprocVecEnv
        def _trunk():
            env = self.args.env  # in this simple case the class TestEnv() is called (see openAI for more envs)
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions except that one can call them from an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[
                rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self,
                   ppo_epochs,
                   mini_batch_size,
                   states,
                   actions,
                   log_probs,
                   returns,
                   advantages,
                   clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())
                # Important step:
                self.optimizer.zero_grad()
                #pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb"))

    def load_network(self, path):
        network_new = pickle.load(open(path, "rb"))
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path +
                                       '/mylog.log',
                                       mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
Example no. 11
    # https://github.com/openai/baselines/blob/f2729693253c0ef4d4086231d36e0a4307ec1cb3/baselines/acktr/utils.py
    num = (q_mu - p_mu)**2 + q_sigma**2 - p_sigma**2
    den = 2 * (p_sigma**2) + 1e-8
    kl = torch.mean(num/den + torch.log(p_sigma) - torch.log(q_sigma))
    return kl
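# The excerpt above begins mid-function; a self-contained sketch of the same
# diagonal-Gaussian KL term (the function name and argument names are assumptions):
import torch

def gaussian_kl(p_mu, p_sigma, q_mu, q_sigma):
    # Mirrors the baselines acktr utility linked above; 1e-8 guards the division.
    num = (q_mu - p_mu) ** 2 + q_sigma ** 2 - p_sigma ** 2
    den = 2 * (p_sigma ** 2) + 1e-8
    return torch.mean(num / den + torch.log(p_sigma) - torch.log(q_sigma))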

def make_env():
    def _thunk():
        env = ActiveVisionDatasetEnv()
        return env
    return _thunk

if __name__ == "__main__":
    num_envs = 6
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    print("!!!state_shape:",state_shape)
    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 10
    num_frames = int(10e7)

    #rmsprop hyperparams:
    lr    = 1e-4
    eps   = 1e-5
    alpha = 0.99
Example no. 12
def main():
    current_time = time.ctime().replace(":", "_")
    log_dir = "logs/PPO/{}".format(current_time)
    # tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # csv
    logfile_name = "{}/train_log.csv".format(log_dir)
    with open(logfile_name, 'w+', newline='') as f:
        csv_writer = csv.writer(f, delimiter=";")
        csv_writer.writerow([
            'update', 'running_loss', 'Reward', 'loss', 'actor_loss',
            'critic_loss', 'entropy_loss', 'time'
        ])

    ############## Hyperparameters ##############
    # env_name = "CartPole-v0"
    # creating environment
    envs = SubprocVecEnv([
        lambda: rpg.Environment('gym', "Neo"),
        lambda: rpg.Environment('gym', "Morpheus"),
        lambda: rpg.Environment('gym', "Trinity"),
        lambda: rpg.Environment('gym', "Oracle"),
        lambda: rpg.Environment('gym', "Cypher"),
        lambda: rpg.Environment('gym', "Tank"),
        lambda: rpg.Environment('gym', "Agent_Smith"),
        lambda: rpg.Environment('gym', "Dozer")
    ])

    env = VecPyTorch(envs, device)

    state_dim = (3, 64, 64)
    action_dim = env.action_space.n
    save_freq = 10000
    print_freq = 10
    max_episodes = 500001  # max training episodes
    max_timesteps = 5  # max timesteps in one episode
    n_latent_var = 256  # number of variables in hidden layer
    update_timestep = 15  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = 11
    actor_loss = 0
    critic_loss = 0
    entropy_loss = 0
    loss = 0
    #############################################

    if random_seed:
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        random.seed(random_seed)
        numpy.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    state, minimap = env.reset()

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # state, minimap = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            dist, _ = ppo.policy_old(state, minimap)
            action = dist.sample()
            state, minimap, reward, done, _ = env.step(action.unsqueeze(1))
            memory.states.append(state)
            memory.maps.append(minimap)
            memory.actions.append(action)
            memory.logprobs.append(dist.log_prob(action))

            # Saving reward and is_terminal:
            memory.rewards.append(reward.to(device).squeeze())
            memory.is_terminals.append(done)

            # update if its time
            if timestep % update_timestep == 0:
                loss, actor_loss, critic_loss, entropy_loss = ppo.update(
                    memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward.mean().item()

        # avg_length += t

        # logging
        if i_episode % print_freq == 0:
            print("********************************************************")
            print("episode: {0}".format(i_episode))
            print("mean/median reward: {:.1f}/{:.1f}".format(
                reward.mean(), reward.median()))
            print("min/max reward: {:.1f}/{:.1f}".format(
                reward.min(), reward.max()))
            print("actor loss: {:.5f}, critic loss: {:.5f}, entropy: {:.5f}".
                  format(actor_loss, critic_loss, entropy_loss))
            print("Loss: {0}".format(loss))
            print("********************************************************")

        # show data in tensorflow
        writer.add_scalar('Loss/Loss', loss, i_episode)
        writer.add_scalar('Loss/Actor Loss', actor_loss, i_episode)
        writer.add_scalar('Loss/Critic Loss', critic_loss, i_episode)
        writer.add_scalar('Loss/Entropy', entropy_loss, i_episode)
        writer.add_scalar('Reward/Running Reward', running_reward, i_episode)

        writer.add_scalar('Reward/Min', reward.min(), i_episode)
        writer.add_scalar('Reward/Max', reward.max(), i_episode)
        writer.add_scalar('Reward/Mean', reward.mean(), i_episode)
        writer.add_scalar('Reward/Median', reward.median(), i_episode)
        writer.add_scalar('Reward/Sum', reward.sum(), i_episode)

        with open(logfile_name, 'a+', newline='') as f:
            csv_writer = csv.writer(f, delimiter=";")
            csv_writer.writerow([
                i_episode, running_reward,
                reward.mean(), loss, actor_loss, critic_loss, entropy_loss,
                time.ctime()
            ])

        if save_freq > 0 and i_episode % save_freq == 0:
            torch.save(ppo.policy.state_dict(), '{}/model.pth'.format(log_dir))
            torch.save(ppo.policy_old.state_dict(),
                       '{}/model_old.pth'.format(log_dir))
            print("saved")
Example no. 13
env = gym.make(enviorment_name)
env.render(mode='human', close=False)  # to visualize the 3D render


# function that creates factory functions for building environments, for the multiprocessing lib
def make_env_list():
    def env_multiprocessing():
        env = gym.make(enviorment_name)
        return env

    return env_multiprocessing


# create a container that holds the different parallel environments.
envs = [make_env_list() for i in range(num_envs)]
envs = SubprocVecEnv(envs)


# Generalized Advantage Estimator
def GAE(next_critic_value, rewards, masks, values, gamma, lambda_):
    gae = 0
    values_ = values + [next_critic_value]
    returns = []
    for k in reversed(range(len(rewards))):
        re = torch.transpose(rewards[k].unsqueeze(1), 0, 1)
        gv = gamma * torch.transpose(values_[k + 1], 0, 1) * masks[k]
        vv = torch.transpose(values_[k], 0, 1)
        delta = re + gv - vv  # "exponential decay"
        gae = delta + gamma * lambda_ * masks[k] * gae  # "smoothing"
        returns.append(gae + vv)
    return list(reversed(returns))
from multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


class ActorCritic:
    def __init__(self,
                 sess,
                 obs,
                 acs,
                 hidden_size,
                 name,
                 trainable,
                 init_std=1.0):
        self.sess = sess
        self.obs = obs
Example no. 15
class env_cover():
    def __init__(self, config, dev):

        self.dev = dev
        self.num_env = config['num_envs']
        self.get_img_from_render = config['get_img_from_render']

        self.obs_shape = (self.num_env, ) + config['obs_space'][1:]
        #        print(self.obs_shape)
        self.reward_shape = (self.num_env, ) + config['reward_space'][1:]
        self.gamma_shape = (self.num_env, ) + config['gamma_space'][1:]

        if self.num_env == 1:
            self.env = gym.make(config['game_name'])
        else:

            def make_env():
                def _thunk():
                    env = gym.make(config['game_name'])
                    return env

                return _thunk

            envs = [make_env() for i in range(self.num_env)]
            self.env = SubprocVecEnv(envs)

#
#def obs_preproc(x):
#    if IMG_GET_RENDER ==False:
#        return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)
#    x = np.dot(x, np.array([[0.299, 0.587, 0.114]]).T)
#    x = np.reshape(x, (1,x.shape[1], x.shape[0]))
#    return torch.from_numpy(np.resize(x, feature_state)).float().unsqueeze(0)/255
#

    def reset(self):
        st = self.env.reset()
        if self.get_img_from_render:
            st = self.env.render(mode='rgb_array')
            st = np.resize(st, self.obs_shape) / 255.

        return torch.FloatTensor(st).reshape(self.obs_shape).to(
            self.dev), torch.zeros(self.reward_shape).to(
                self.dev), torch.zeros(self.gamma_shape).to(self.dev)
        #return st, 0,False

#    def get_obs(self,obs):
#        return torch.from_numpy(obs).detach().float().view(1,config['obs_space'])

    def step(self, action):

        st, rt, dt, _ = self.env.step(action)

        if self.get_img_from_render:
            st = self.env.render(mode='rgb_array')
            st = np.resize(st, self.obs_shape) / 255.


#        print(st)
        st = torch.FloatTensor(st).reshape(self.obs_shape).to(self.dev)
        rt = torch.FloatTensor([rt]).reshape(self.reward_shape).to(self.dev)
        if self.num_env == 1:
            dt = torch.FloatTensor([dt]).reshape(self.gamma_shape).to(self.dev)
        else:
            dt = torch.FloatTensor(dt.astype(int)).reshape(
                self.gamma_shape).to(self.dev)

        return st, rt, dt

    def end_dummy(self):
        return torch.zeros(self.obs_shape).to(self.dev), torch.zeros(
            self.reward_shape).to(self.dev), torch.zeros(self.gamma_shape).to(
                self.dev)

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
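# Usage sketch (not part of the original example): the config keys below mirror the
# ones read in __init__ above, but the concrete game and shapes are invented for
# illustration and assume the classic 4-tuple gym step API used by this wrapper.
import torch

config = {
    'num_envs': 1,
    'get_img_from_render': False,
    'game_name': 'CartPole-v1',
    'obs_space': (1, 4),
    'reward_space': (1, 1),
    'gamma_space': (1, 1),
}
env = env_cover(config, torch.device('cpu'))
state, reward, gamma = env.reset()                 # zero reward/gamma placeholders
state, reward, done = env.step(env.env.action_space.sample())
env.close()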
Example no. 16
for i in range(num_envs_possible):
    if (rospy.has_param("/GETjag" + str(i) + "/worker_ready")):
        if (rospy.get_param("/GETjag" + str(i) + "/worker_ready")):
            num_envs += 1
            print("worker_", num_envs)

def make_env(i):
    def _thunk():
        env =  robotEnv(i)
        return env

    return _thunk

envs = [make_env(i+1) for i in range(num_envs)]

envs = SubprocVecEnv(envs)

state_size_map  = envs.observation_space[0].shape[0] * envs.observation_space[1].shape[1]
state_size_depth  = envs.observation_space[1].shape[0] * envs.observation_space[1].shape[1]
state_size_goal   = envs.observation_space[2].shape[0]


num_outputs = envs.action_space.shape[0]

stack_size = 1

class image_stacker():
    def __init__(self, state_size, stack_size):
        self.stacked_frames = deque([np.zeros((state_size_map), dtype=np.float32) for i in range(stack_size)], maxlen=stack_size)
    def return_stacked_frame(self):
            return self.stacked_frames
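# The class above only stores the deque; a hedged sketch of how frame stacking with
# such a maxlen-bounded deque usually proceeds (the helper below is an assumption,
# not part of the original project):
import numpy as np
from collections import deque

def push_frame(stacked_frames, frame):
    # Appending to a full deque silently drops the oldest frame.
    stacked_frames.append(frame)
    return np.concatenate(list(stacked_frames), axis=0)   # flat stacked observation

stacked = deque([np.zeros(4, dtype=np.float32) for _ in range(4)], maxlen=4)
obs = push_frame(stacked, np.ones(4, dtype=np.float32))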
Example no. 17
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, state_shape, action_shape, stats):
        ''' -one agent is assigned to a collector.
            -a collector runs a bunch of envs in parallel to feed to that agent
            -you could run a bunch of collectors simultaneously,
                |-  and then use weight mixing on the agents separately
        '''
        #self.storage_device = torch.device("cpu")

        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.states = torch.zeros(
            (batch_size, rollout_length + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, rollout_length + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, rollout_length + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, rollout_length + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, rollout_length + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")

        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():
            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index,
                                           end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.rollout_length + 1):
                    state = torch.Tensor(self.state).float().to(
                        self.agent.device)
                    policy_dist = self.agent.actor(state)
                    action = policy_dist.sample()
                    if self.agent.tanh_action_clamping:
                        action = torch.tanh(action)
                    else:
                        action = action.clamp(-1, 1)  #   depends on env
                    cpu_actions = action.cpu().numpy()
                    state_, reward, done, info = self.envs.step(cpu_actions)

                    value = self.agent.critic(state)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 -
                                              done).float().unsqueeze(1).to(
                                                  self.agent.device)

                    self.states[run_indices,
                                rollout_idx] = state[worker_indices]
                    self.actions[run_indices,
                                 rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices,
                                   rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices,
                                rollout_idx] = value[worker_indices]
                    self.rewards[run_indices,
                                 rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices,
                                    rollout_idx] = done_masks[worker_indices]

                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")

        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
        for i in reversed(range(self.rollout_length)):
            delta = self.rewards[:,
                                 i] + self.gamma * self.values[:, i +
                                                               1] * self.done_masks[:,
                                                                                    i] - self.values[:,
                                                                                                     i]
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae

        self.GAE_calculated = True

    def random_batch_iter(self):
        if not self.buffer_full and not self.GAE_calculated:
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''-there's no way all the workers are aligned, especially after an episode or so,
            so we might just be able to use a vertical index'''
        batch_indices = torch.randperm(self.rollout_length)
        for i in range(self.rollout_length):
            index = batch_indices[i]
            state = self.states[:, index]
            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
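# Hedged driver sketch (the agent, make_env_func and stats objects are assumptions
# taken from the constructor signature): one training iteration alternates
# collection, GAE computation, minibatch iteration and a reset of the buffer.
collector = RolloutCollector(num_env_workers=8, make_env_func=make_env_func,
                             agent=agent, batch_size=64, rollout_length=16,
                             state_shape=(24,), action_shape=(4,), stats=stats)
collector.collect_samples()          # fill the on-policy buffer
collector.compute_gae()              # advantages/returns computed in place
for state, action, log_prob, advantage, return_ in collector.random_batch_iter():
    pass                             # e.g. feed each slice into a PPO update
collector.reset()                    # mark the buffer stale before the next round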
#####################################################################

from multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"

def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)

#####################################################################

class ActorCritic:
    def __init__(self, sess, obs, acs, hidden_size, name, trainable, init_std=1.0):
        self.sess = sess
        self.obs = obs
        self.acs = acs
        self.hidden_size = hidden_size
        self.name = name
        self.trainable = trainable
        self.init_std = init_std
Example no. 19
num_envs = 8
env_name = "CartPole-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


plt.ion()
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)  # 8 env

env = gym.make(env_name)  # a single env


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(  # network that outputs value
            nn.Linear(num_inputs, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, 1))

        self.actor = nn.Sequential(  # network that outputs prob of action
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
Example no. 20
def create_envs(p, args, N):  #creates multiple environments for training
    urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
    envs = [make_env(p, urdf_path, args=args) for i in range(N)]
    envs = SubprocVecEnv(envs)

    return envs
Example no. 21
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e3)

    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99

    #Init a2c and rmsprop
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    actor_critic = actor_critic.cuda()

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            action = actor_critic.act(autograd.Variable(state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        if i_update % num_frames == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "actor_critic_" + mode)

    import time

    def displayImage(image, step, reward):
        #clear_output(True)
        s = "step: " + str(step) + " reward: " + str(reward)
        plt.figure(figsize=(10, 3))
        plt.title(s)
        plt.imshow(image)
        plt.show()
        time.sleep(0.1)

    env = MiniPacman(mode, 1000)

    done = False
    state = env.reset()
    total_reward = 0
    step = 1

    while not done:
        current_state = torch.FloatTensor(state).unsqueeze(0)
        #if USE_CUDA:
        #    current_state = current_state.cuda()

        action = actor_critic.act(autograd.Variable(current_state))

        next_state, reward, done, _ = env.step(action.data[0, 0])
        total_reward += reward
        state = next_state

        image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy()
        displayImage(image, step, total_reward)
        step += 1
class RolloutCollector:
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, num_recurrence_steps, state_shape,
                 action_shape, stats):
        ''' -one agent is assigned to a collector.
            -a collector runs a bunch of envs in parallel to feed to that agent
            -you could run a bunch of collectors simultaneously,
                |-  and then use weight mixing on the agents separately
        '''
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.num_recurrence_steps = num_recurrence_steps
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1
        self.states = torch.zeros(
            (batch_size, self.buffer_width + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()
        self.hidden_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
        self.cell_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)

    def collect_samples(self):
        if self.buffer_full:
            raise Exception(
                "tried to collect more samples when buffer already full")
        num_runs_to_full = math.ceil(self.batch_size / self.num_env_workers)
        with torch.no_grad():

            self.hidden_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.num_env_workers,
                 self.agent.hidden_state_size)).to(self.agent.device)

            for collection_run in range(num_runs_to_full):
                start_index = collection_run * self.num_env_workers
                end_index_exclusive = min(start_index + self.num_env_workers,
                                          self.batch_size)
                run_indices = torch.arange(start_index,
                                           end_index_exclusive,
                                           dtype=torch.long)
                worker_indices = run_indices % self.num_env_workers

                for rollout_idx in range(self.buffer_width + 1):
                    state = torch.Tensor(self.state).float().to(
                        self.agent.device)

                    #   for recurrences
                    lstm_input = state.view(-1, 1, *self.state_shape)
                    output, (hidden, cell) = self.agent.lstm(
                        lstm_input, (self.hidden_state, self.cell_state))
                    output = output.reshape(self.num_env_workers,
                                            self.agent.hidden_state_size)

                    policy_dist = self.agent.actor(output)
                    action = policy_dist.sample()
                    action = action.clamp(-1, 1)  #   depends on env
                    state_, reward, done, info = self.envs.step(
                        action.cpu().numpy())

                    value = self.agent.critic(output)
                    log_prob = policy_dist.log_prob(action)

                    reward = torch.Tensor(reward).float().unsqueeze(1).to(
                        self.agent.device)
                    done_masks = torch.Tensor(1.0 -
                                              done).float().unsqueeze(1).to(
                                                  self.agent.device)

                    self.states[run_indices,
                                rollout_idx] = state[worker_indices]
                    self.actions[run_indices,
                                 rollout_idx] = action[worker_indices]
                    self.log_probs[run_indices,
                                   rollout_idx] = log_prob[worker_indices]
                    self.values[run_indices,
                                rollout_idx] = value[worker_indices]
                    self.rewards[run_indices,
                                 rollout_idx] = reward[worker_indices]
                    self.done_masks[run_indices,
                                    rollout_idx] = done_masks[worker_indices]

                    self.hidden_state[0, worker_indices] *= self.done_masks[
                        run_indices,
                        rollout_idx].expand(-1, self.agent.hidden_state_size)
                    self.cell_state[0, worker_indices] *= self.done_masks[
                        run_indices,
                        rollout_idx].expand(-1, self.agent.hidden_state_size)
                    self.state = state_

        self.buffer_full = True
        self.stats.update_collection_stats(
            num_samples_collected_inc=self.batch_size * self.rollout_length)

    def compute_gae(self):
        if not self.buffer_full:
            raise Exception(
                "buffer is not full of new samples yet (so not ready for GAE)")

        gae = torch.zeros((self.batch_size, 1)).to(self.agent.device)
        for i in reversed(range(self.buffer_width)):
            delta = (self.rewards[:, i]
                     + self.gamma * self.values[:, i + 1] * self.done_masks[:, i]
                     - self.values[:, i])
            gae = delta + self.gamma * self.tau * self.done_masks[:, i] * gae
            self.returns[:, i] = gae + self.values[:, i]
            self.advantages[:, i] = gae

        self.GAE_calculated = True

    def get_leading_states(self, index):
        indices_with_leading_states = torch.arange(
            self.num_recurrence_steps) - self.num_recurrence_steps + 1 + index
        leading_states = self.states[:, indices_with_leading_states]

        #   some of the leading states might be from previous episodes,
        #   in which case we don't want to consider them at all
        leading_state_indices = indices_with_leading_states[:-1]
        leading_dones = 1 - self.done_masks[:, leading_state_indices]
        last_leading_dones = leading_dones.nonzero()[:, :2]
        for batch_index, last_done in last_leading_dones:
            previous_episode_indices = torch.arange(last_done + 1)
            leading_states[batch_index, previous_episode_indices] = 0

        return leading_states

    def random_batch_iter(self):
        if not self.buffer_full or not self.GAE_calculated:
            raise Exception(
                "buffer is not ready for sampling yet. (not full/no GAE)")
        '''-there's no way all the workers are still aligned, especially after
            an episode or so, so a single (vertical) time index is enough'''
        batch_indices = torch.randperm(self.rollout_length)

        #   recurrence stuff
        if self.num_recurrence_steps > 0:
            batch_indices = torch.randperm(
                self.rollout_length) + self.num_recurrence_steps - 1
            self.hidden_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)
            self.cell_state = torch.zeros(
                (1, self.batch_size,
                 self.agent.hidden_state_size)).to(self.agent.device)

        for i in range(self.rollout_length):
            index = batch_indices[i]
            leading_states = self.get_leading_states(index)
            output, (hidden, cell) = self.agent.lstm(
                leading_states, (self.hidden_state, self.cell_state))
            state = output[:, -1, :]

            action = self.actions[:, index]
            log_prob = self.log_probs[:, index]
            advantage = self.advantages[:, index]
            return_ = self.returns[:, index]
            yield state, action, log_prob, advantage, return_

    def reset(self):
        self.buffer_full = False
        self.GAE_calculated = False
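
For reference, compute_gae above follows the standard generalised advantage estimation recursion. A minimal standalone sketch of the same computation on plain 1-D tensors (the function name and the default gamma/tau values are illustrative, not taken from the class above):

import torch

def gae_sketch(rewards, values, dones, gamma=0.99, tau=0.95):
    # rewards, dones: shape (T,); values: shape (T + 1,) including the bootstrap value
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        mask = 1.0 - dones[t]                              # 0 where the episode ended
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * tau * mask * gae             # discounted sum of TD errors
        advantages[t] = gae
    returns = advantages + values[:-1]                     # same bootstrap as compute_gae
    return advantages, returns
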
Esempio n. 23
0
File: train.py Progetto: ProxJ/play
def train(env, agent, flags):
    """"""

    # set random seeds (for reproducibility)
    torch.manual_seed(flags['seed'])
    torch.cuda.manual_seed_all(flags['seed'])
    envs = [make_env(flags['env'], flags['seed'], i) for i in range(flags['num_envs'])]
    envs = SubprocVecEnv(envs)

    # instantiate the policy and optimiser
    num_inputs  = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    # NOTE: assumes an ActorCritic module and 'hidden_size'/'lr' entries in flags
    model = ActorCritic(num_inputs, num_outputs, flags['hidden_size']).to(device)
    optimizer = optim.Adam(model.parameters(), lr=flags['lr'])
    
    current_step_number = 0
    test_rewards = []
    state = envs.reset()

    
    while current_step_number < flags['max_steps']:
        
        log_probs = []
        values    = []
        rewards   = []
        masks     = []
        entropy = 0

        for _ in range(flags['num_step_td_update']):

            # query the model directly so the distribution and value estimate
            # are available for the bookkeeping below
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)
            # sample an action from the distribution
            action = dist.sample()
            # take a step in the environment
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            # compute the log probability
            log_prob = dist.log_prob(action)
            # compute the entropy
            entropy += dist.entropy().mean()
            
            # save the log probability, value and reward 
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))



            # if done, save episode rewards

            state = next_state
            current_step_number += 1
            
            if flags['plot_test'] and current_step_number % 1000 == 0:
                test_rewards.append(np.mean([test_env(model) for _ in range(10)]))
                plot(current_step_number, test_rewards)

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
   
        # calculate the discounted return of the episode
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns   = torch.cat(returns).detach()
        values    = torch.cat(values)

        advantage = returns - values

        actor_loss  = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        # loss function
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return rewards
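
The loop above calls a compute_returns helper that is not shown in this excerpt. A minimal sketch of the n-step bootstrapped return such a helper usually computes (the default gamma is an assumption):

def compute_returns(next_value, rewards, masks, gamma=0.99):
    # bootstrap from the value of the state following the last collected step
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]  # mask zeroes R at episode ends
        returns.insert(0, R)
    return returns
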
Esempio n. 24
0
    parser = argparse.ArgumentParser()
    parser.add_argument("--epoch", default=int(5e5), type=int)
    args = parser.parse_args()

    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 10
    num_frames = args.epoch

    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
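
The RMSprop hyperparameters listed above map directly onto torch.optim.RMSprop. A minimal sketch of wiring them up (the placeholder module is purely illustrative, not the script's actual model):

import torch.nn as nn
import torch.optim as optim

lr, eps, alpha = 7e-4, 1e-5, 0.99   # values from the hyperparameter block above
actor_critic = nn.Linear(4, 2)      # placeholder module for illustration only
optimizer = optim.RMSprop(actor_critic.parameters(), lr=lr, eps=eps, alpha=alpha)
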
Esempio n. 25
0
        num_boids = np.random.randint(MIN_NUM_BOIDS, MAX_NUM_BOIDS + 1)
        num_spheres = np.random.randint(MIN_NUM_SPHERES, MAX_NUM_SPHERES + 1)

        env_num_boids.append(num_boids)

        envs.append(make_env(num_boids, num_spheres))

        edges = utils.system_edges(NUM_GOALS, num_spheres, num_boids)
        edge_types = one_hot(edges, EDGE_TYPES)
        padded_edge_types.append(
            utils.pad_data(edge_types, MAX_NUM_NODES, dims=[0, 1]))

        mask = utils.get_mask(num_boids, MAX_NUM_NODES)
        masks.append(mask)

    envs = SubprocVecEnv(envs)
    padded_edge_types = np.array(padded_edge_types)
    masks = np.array(masks)

    swarmnet_params = load_model_params('config/il_rl.json')
    actorcritic = get_swarmnet_actorcritic(swarmnet_params,
                                           '../../Logs/swarmnet_rl_test')

    swarmnet_agent = PPOAgent(actorcritic,
                              NDIM,
                              action_bound=None,
                              rollout_steps=ROLLOUT_STEPS,
                              memory_capacity=4096,
                              summary_writer=None,
                              mode=0)
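
utils.pad_data and utils.get_mask above come from the project's own utilities. As a purely illustrative stand-in (not the project's implementation), padding a one-hot edge-type array up to MAX_NUM_NODES and building a node-validity mask could look like:

import numpy as np

def pad_edge_types(edge_types, max_num_nodes):
    # edge_types: (n, n, k) one-hot array; zero-pad the first two dims
    n, _, k = edge_types.shape
    pad = max_num_nodes - n
    return np.pad(edge_types, ((0, pad), (0, pad), (0, 0)))

def node_mask(num_valid, max_num_nodes):
    # 1.0 for real nodes, 0.0 for padded slots
    mask = np.zeros(max_num_nodes, dtype=np.float32)
    mask[:num_valid] = 1.0
    return mask
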
Esempio n. 26
0
num_envs = 16
env_name = "Pendulum-v0"

# TODO: add an env reset


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()


def test_env(vis=False):
    state = env.reset()
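
test_env is cut off here; in the tutorials this snippet follows, it typically rolls out one episode in the single env with the current model and returns the episode reward. A hedged sketch of that pattern, assuming model and device are defined later in the script and that the model returns a (distribution, value) pair:

import torch

def test_env(vis=False):
    state = env.reset()
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)                   # assumed (distribution, value) interface
        action = dist.sample().cpu().numpy()[0]
        state, reward, done, _ = env.step(action)
        if vis:
            env.render()
        total_reward += reward
    return total_reward
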
Esempio n. 27
0
if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--name", default=ENV_ID, help="Name of the run")
    args = parser.parse_args()
    writer = SummaryWriter(comment="ppo_" + args.name)
    
    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device   = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)
    
    # Prepare environments
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    env = gym.make(ENV_ID)
    env.n_foods = 10
    obs_ = env.reset()
    num_inputs  = obs_.shape
    num_outputs = env.action_space.shape[0]

    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE, std=0.1).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx  = 0
    train_epoch = 0
    best_reward = None

    state = envs.reset()
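
best_reward above is usually paired with periodic evaluation: whenever the average test reward improves, the model is checkpointed into the 'checkpoints' directory created at the top of the block. A hedged sketch of that pattern (the test_env helper, the 10-episode average, and the file-name format are assumptions):

    # inside the training loop, after an evaluation pass (sketch only)
    test_reward = np.mean([test_env() for _ in range(10)])
    writer.add_scalar("test_reward", test_reward, frame_idx)
    if best_reward is None or best_reward < test_reward:
        if best_reward is not None:
            name = "checkpoints/%s_best_%+.3f_%d.dat" % (args.name, test_reward, frame_idx)
            torch.save(model.state_dict(), name)
        best_reward = test_reward
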
Esempio n. 28
0
                        default=False,
                        help="Use multi process")

    args = parser.parse_args()
    writer = SummaryWriter(comment="ppo_connectx")

    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)

    # Prepare environments
    envs = [make_env() for i in range(args.envs)]
    envs = MultiEnv(envs)
    if args.mp:
        envs = SubprocVecEnv(envs)
    env = OhlcvEnv(WINDOW_SIZE, './data/test/')
    obs_ = env.reset()
    num_inputs = env.observation_space.shape
    num_outputs = env.action_space.n

    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE,
                        std=0.0).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    train_epoch = 0
    best_reward = None

    state = envs.reset()
Esempio n. 29
0
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (return_ - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


if __name__ == '__main__':
    logger = Logger('./log')
    env = ProstheticsEnv(False)
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        states = []
        actions = []
        rewards = []
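
The surr1/surr2 terms at the top of this example are the two sides of PPO's clipped surrogate objective. A minimal sketch of how they are typically built from the stored and freshly computed log-probabilities (the 0.2 clip range is an assumption):

# ratio between the updated policy and the policy that collected the data
ratio = (new_log_probs - old_log_probs).exp()
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1.0 - 0.2, 1.0 + 0.2) * advantage
actor_loss = -torch.min(surr1, surr2).mean()   # pessimistic (clipped) estimate
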
Esempio n. 30
0
num_envs = 8
env_name = "CartPole-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


plt.ion()
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)  # 8 env

env = gym.make(env_name)  # a single env


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(nn.Linear(num_inputs, hidden_size),
                                    nn.ReLU(), nn.Linear(hidden_size, 1))

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
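
The class is truncated here; in the tutorial this example follows, the actor head usually ends with a Softmax and the forward pass returns a Categorical distribution together with the value estimate. A hedged sketch of the remainder (not necessarily this author's exact code):

            nn.Softmax(dim=1),
        )

    def forward(self, x):
        value = self.critic(x)
        probs = self.actor(x)
        dist = Categorical(probs)   # requires: from torch.distributions import Categorical
        return dist, value
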
Esempio n. 31
0
def dqn_algorithm(ENV_NAME,
                  NUM_ENV=8,
                  SEED=1,
                  TOTAL_TIMESTEPS=100000,
                  GAMMA=0.95,
                  MEMORY_SIZE=1000,
                  BATCH_SIZE=32,
                  EXPLORATION_MAX=1.0,
                  EXPLORATION_MIN=0.02,
                  EXPLORATION_FRACTION=0.7,
                  TRAINING_FREQUENCY=1000,
                  FILE_PATH='results/',
                  SAVE_MODEL=False,
                  MODEL_FILE_NAME='model',
                  LOG_FILE_NAME='log',
                  TIME_FILE_NAME='time',
                  PRINT_FREQ=100,
                  N_EP_AVG=100,
                  VERBOSE=False,
                  MLP_LAYERS=[64, 64],
                  MLP_ACTIVATIONS=['relu', 'relu'],
                  LEARNING_RATE=1e-3,
                  EPOCHS=1,
                  GRAD_CLIP=False,
                  DOUBLE_DQN=False,
                  USE_TARGET_NETWORK=True,
                  TARGET_UPDATE_FREQUENCY=5000,
                  LOAD_WEIGHTS=False,
                  LOAD_WEIGHTS_MODEL_PATH='results/model0.h5'):
    '''
    DQN Algorithm execution

    env_name : string name of a gym environment
    num_env : number of environments for vectorisation (multiprocessing envs)
    total_timesteps : total number of timesteps
    training_frequency : frequency of training (experience replay)
    gamma : discount factor
    memory_size : replay buffer size
    batch_size : batch size for experience replay
    exploration_max : maximum exploration at the beginning
    exploration_min : minimum exploration at the end
    exploration_fraction : fraction of total timesteps over which the exploration decay takes place
    file_path : output filepath
    save_model : boolean specifying whether the model is to be saved
    model_file_name : name of the file to save the model to at the end of learning
    log_file_name : name of the file to store DQN results
    time_file_name : name of the file to store computation time
    print_freq : episodic frequency at which results are printed
    n_ep_avg : number of episodes to consider when computing the average reward
    verbose : print episodic results
    mlp_layers : list of neurons in each hidden layer of the DQN network
    mlp_activations : list of activation functions in each hidden layer of the DQN network
    learning_rate : learning rate for the neural network
    epochs : number of epochs in every experience replay
    grad_clip : boolean specifying whether to use gradient clipping in the optimizer (grad-clip value 10.0)
    double_dqn : boolean specifying whether to employ double DQN
    use_target_network : boolean to use a target neural network in DQN
    target_update_frequency : timestep frequency of weight updates from the online network to the target network
    load_weights : boolean specifying whether to use a pre-specified model to initialize the weights of the neural network
    load_weights_model_path : path of the model to use for weight initialization
    '''

    before = time.time()
    num_envs = NUM_ENV
    env_name = ENV_NAME

    if TOTAL_TIMESTEPS % NUM_ENV:
        print('Error: total timesteps is not divisible by no. of envs')
        return

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            env.seed(SEED)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    # for reproducibility
    set_seed(SEED)

    observation_space = envs.observation_space.shape[0]
    action_space = envs.action_space.n

    dqn_solver = DQNSolver(observation_space, action_space, MLP_LAYERS,
                           MLP_ACTIVATIONS, LEARNING_RATE, EPOCHS,
                           USE_TARGET_NETWORK, GRAD_CLIP, DOUBLE_DQN,
                           LOAD_WEIGHTS, LOAD_WEIGHTS_MODEL_PATH,
                           TOTAL_TIMESTEPS, MEMORY_SIZE, BATCH_SIZE, GAMMA,
                           EXPLORATION_MAX, EXPLORATION_MIN,
                           EXPLORATION_FRACTION)

    envs = ParallelEnvWrapper(envs)
    t = 0
    episode_rewards = [0.0] * num_envs
    explore_percent, episodes, mean100_rew, steps, NN_tr_loss = [],[],[],[],[]
    while True:
        state = envs.reset()
        # state = np.reshape(state, [1, observation_space])
        while True:
            t += num_envs
            dqn_solver.eps_timestep_decay(t)
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = envs.step(action)
            # print(terminal)
            # reward = reward if not terminal else -reward
            # state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            if t % TRAINING_FREQUENCY == 0:
                dqn_solver.experience_replay()
            state = state_next
            episode_rewards[-num_envs:] = [
                i + j for (i, j) in zip(episode_rewards[-num_envs:], reward)
            ]
            # num_episodes = len(episode_rewards)
            # print(terminal)
            if (t % PRINT_FREQ == 0):
                explore_percent.append(dqn_solver.exploration_rate * 100)
                episodes.append(len(episode_rewards))
                mean100_rew.append(
                    round(np.mean(episode_rewards[(-1 - N_EP_AVG):-1]), 1))
                steps.append(t)
                NN_tr_loss.append(dqn_solver.loss)
                if VERBOSE:
                    print('Exploration %: ' + str(int(explore_percent[-1])) +
                          ' ,Episodes: ' + str(episodes[-1]) +
                          ' ,Mean_reward: ' + str(mean100_rew[-1]) +
                          ' ,timestep: ' + str(t) + ' , tr_loss: ' +
                          str(round(NN_tr_loss[-1], 4)))

            if t > TOTAL_TIMESTEPS:
                output_table = np.stack((steps, mean100_rew, episodes,
                                         explore_percent, NN_tr_loss))
                if not os.path.exists(FILE_PATH):
                    os.makedirs(FILE_PATH)
                file_name = str(FILE_PATH) + LOG_FILE_NAME + '.csv'
                np.savetxt(
                    file_name,
                    np.transpose(output_table),
                    delimiter=',',
                    header=
                    'Timestep,Rewards,Episodes,Exploration %,Training Score')
                after = time.time()
                time_taken = after - before
                np.save(str(FILE_PATH) + TIME_FILE_NAME, time_taken)
                if SAVE_MODEL:
                    file_name = str(FILE_PATH) + MODEL_FILE_NAME + '.h5'
                    dqn_solver.model.save(file_name)
                return dqn_solver.model
            if USE_TARGET_NETWORK and t % TARGET_UPDATE_FREQUENCY == 0:
                dqn_solver.update_target_network()
            # print(t)
            if terminal.all():
                episode_rewards += [0.0] * num_envs
                break
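
Given the signature and docstring above, a minimal hypothetical invocation of dqn_algorithm on CartPole could look like the following; every value shown is illustrative only:

if __name__ == '__main__':
    model = dqn_algorithm('CartPole-v0',
                          NUM_ENV=8,
                          TOTAL_TIMESTEPS=100000,
                          MLP_LAYERS=[64, 64],
                          MLP_ACTIVATIONS=['relu', 'relu'],
                          FILE_PATH='results/cartpole/',
                          SAVE_MODEL=True,
                          VERBOSE=True)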