Code Example #1
    def rollout(self, net):
        """
        rollout handles the actual rollout of the environment for n
        steps in time.

        net - torch Module object. This is the model to interact with the
            environment.
        """
        net.eval()
        state = next_state(self.env,
                           self.obs_deque,
                           obs=None,
                           reset=True,
                           preprocess=self.hyps['preprocess'])
        ep_rew = 0
        hyps = self.hyps
        is_recurrent = hasattr(net, "fresh_h")
        if not is_recurrent:
            h = None
        else:
            h = net.fresh_h()
        t = 0
        episode_count = 1
        while t <= 400:
            t += 1
            state = cuda_if(torch.FloatTensor(state))
            if is_recurrent:
                val, logits, h = net(state[None], h=cuda_if(h.detach().data))
            else:
                val, logits = net(state[None])
            if self.hyps['discrete_env']:
                probs = F.softmax(logits, dim=-1)
                action = sample_action(probs.data)
                action = int(action.item())
            else:
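                # Continuous control: reparameterized Gaussian sample, a = mu + sigma * eps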
                mu, sig = logits
                action = mu + torch.randn_like(sig) * sig
                action = action.cpu().detach().numpy().squeeze()
                if len(action.shape) == 0:
                    action = np.asarray([float(action)])
            obs, rew, done, info = self.env.step(action + hyps['action_shift'])
            if hyps['render']:
                self.env.render()
            ep_rew += rew
            reset = done
            if "Pong" in hyps['env_type'] and rew != 0:
                done = True
            if done:
                episode_count += 1

            state = next_state(self.env,
                               self.obs_deque,
                               obs=obs,
                               reset=reset,
                               preprocess=hyps['preprocess'])
        return ep_rew / episode_count, ep_rew / t
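
A note on helpers: the rollout above relies on cuda_if, next_state, and sample_action, which are not shown in this listing. The sketch below gives plausible minimal implementations inferred from the call sites (frame stacking via a deque, categorical sampling from a probability tensor); the repository's actual versions may differ, and some later examples pass an explicit use_cuda flag to cuda_if.

import numpy as np
import torch

def cuda_if(tobj, use_cuda=None):
    """Move a tensor or module to the GPU when CUDA is available (or use_cuda is True)."""
    if use_cuda is None:
        use_cuda = torch.cuda.is_available()
    return tobj.cuda() if use_cuda else tobj

def sample_action(probs):
    """Sample an action index from a tensor of categorical probabilities."""
    return torch.multinomial(probs, 1)

def next_state(env, obs_deque, obs=None, reset=False, preprocess=None):
    """Preprocess the newest observation, update the frame-stack deque, and
    return the stacked MDP state as a numpy array."""
    if reset or obs is None:
        prepped = preprocess(env.reset())
        for _ in range(obs_deque.maxlen):
            obs_deque.append(prepped)
    else:
        obs_deque.append(preprocess(obs))
    return np.concatenate(list(obs_deque), axis=0)
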
Code Example #2
    def run(self, net):
        """
        run is the entry function to begin collecting rollouts from the
        environment using the specified net. gate_q indicates when to begin
        collecting a rollout and is controlled from the main process.
        The stop_q is used to indicate to the main process that a new rollout
        has been collected.

        net - torch Module object. This is the model to interact with the
            environment.
        """
        self.net = net
        self.env = gym.make(self.hyps['env_type'])
        state = next_state(self.env,
                           self.obs_deque,
                           obs=None,
                           reset=True,
                           preprocess=self.hyps['preprocess'])
        self.state_bookmark = state
        self.h_bookmark = None
        if self.net.is_recurrent:
            self.h_bookmark = Variable(cuda_if(torch.zeros(1,
                                                           self.net.h_size)))
        self.ep_rew = 0
        #self.net.train(mode=False) # fixes potential batchnorm and dropout issues
        for p in self.net.parameters():  # Turn off gradient collection
            p.requires_grad = False
        while True:
            idx = self.gate_q.get()  # Opened from main process
            self.rollout(self.net, idx, self.hyps)
            self.stop_q.put(
                idx)  # Signals to main process that data has been collected
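
For context, here is a minimal, hypothetical sketch of the coordinating side of the gate_q / stop_q handshake described in the docstring above; the real loop appears inside the train() examples further down, and the names dummy_runner and n_rollouts are placeholders.

import torch.multiprocessing as mp

def dummy_runner(gate_q, stop_q):
    # Stand-in for Runner.run(): handle one rollout per index received on gate_q.
    while True:
        idx = gate_q.get()
        if idx is None:
            break
        # ... rollout(net, idx, hyps) would fill the shared tensors here ...
        stop_q.put(idx)

if __name__ == '__main__':
    n_rollouts = 4
    gate_q, stop_q = mp.Queue(n_rollouts), mp.Queue(n_rollouts)
    proc = mp.Process(target=dummy_runner, args=(gate_q, stop_q))
    proc.start()
    for idx in range(n_rollouts):
        gate_q.put(idx)      # open the gate for each rollout slot
    for _ in range(n_rollouts):
        stop_q.get()         # block until every rollout has been collected
    gate_q.put(None)         # tell the runner to exit
    proc.join()
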
Code Example #3
    def test(self):
        ob = self.test_env.reset()
        done = False
        ep_reward = 0
        last_action = np.array([-1])
        action_repeat = 0

        while not done:
            ob = np.array(ob)
            ob = torch.from_numpy(ob.transpose((2, 0, 1))).float().unsqueeze(0)
            ob = Variable(ob / 255., volatile=True)
            ob = cuda_if(ob, self.cuda)

            pi, v = self.policy(ob)
            _, action = torch.max(pi, dim=1)

            # abort after {self.test_repeat_max} discrete action repeats
            if action.data[0] == last_action.data[0]:
                action_repeat += 1
                if action_repeat == self.test_repeat_max:
                    return ep_reward
            else:
                action_repeat = 0
            last_action = action

            ob, reward, done, _ = self.test_env.step(action.data.cpu().numpy())

            ep_reward += reward

        return ep_reward
Code Example #4
    def interact(self):
        """ Interacts with the environment

        Returns:
            obs (FloatTensor): observations shaped [T + 1 x N x ...]
            rewards (FloatTensor): rewards shaped [T x N x 1]
            masks (FloatTensor): continuation masks shaped [T x N x 1]
                zero at done timesteps, one otherwise
            actions (LongTensor): discrete actions shaped [T x N x 1]
            steps (int): total number of steps taken
        """
        N = self.num_workers
        T = self.worker_steps

        # TEMP needs to be generalized, does conv-specific transpose for PyTorch
        obs = torch.zeros(T + 1, N, 4, 84, 84)
        obs = cuda_if(obs, self.cuda)
        rewards = torch.zeros(T, N, 1)
        rewards = cuda_if(rewards, self.cuda)
        masks = torch.zeros(T, N, 1)
        masks = cuda_if(masks, self.cuda)
        actions = torch.zeros(T, N, 1).long()
        actions = cuda_if(actions, self.cuda)

        for t in range(T):
            # interaction logic
            ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
            ob = Variable(ob / 255.)
            ob = cuda_if(ob, self.cuda)
            obs[t] = ob.data

            pi, v = self.policy(ob)
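            # Sample actions with the Gumbel-max trick: pi - log(-log u) equals
            # pi plus Gumbel noise, and its argmax is a draw from softmax(pi).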
            u = cuda_if(torch.rand(pi.size()), self.cuda)
            _, action = torch.max(pi.data - (-u.log()).log(), 1)
            action = action.unsqueeze(1)
            actions[t] = action

            self.last_ob, reward, done, _ = self.venv.step(
                action.cpu().numpy())
            reward = torch.from_numpy(reward).unsqueeze(1)
            rewards[t] = torch.clamp(reward, min=-1., max=1.)
            masks[t] = mask = torch.from_numpy((1. - done)).unsqueeze(1)

        ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
        ob = Variable(ob / 255.)
        ob = cuda_if(ob, self.cuda)
        obs[T] = ob.data

        steps = N * T

        return obs, rewards, masks, actions, steps
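
A small standalone check (not from the repository) that the Gumbel-max sampling used in the interaction loop above matches the categorical distribution given by the softmax of the logits.

import torch

torch.manual_seed(0)
logits = torch.tensor([[2.0, 0.5, -1.0]])
u = torch.rand(100000, 3)
samples = torch.argmax(logits - (-u.log()).log(), dim=1)
freqs = torch.bincount(samples, minlength=3).float() / samples.numel()
print(freqs)                          # empirical sampling frequencies
print(torch.softmax(logits, dim=1))   # target probabilities
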
Code Example #5
    def gae(self, rewards, values, next_vals, dones, gamma, lambda_):
        """
        Performs Generalized Advantage Estimation
    
        rewards - torch FloatTensor of actual rewards collected. Size = L
        values - torch FloatTensor of value predictions. Size = L
        next_vals - torch FloatTensor of value predictions. Size = L
        dones - torch FloatTensor of done signals. Size = L
        gamma - float discount factor
        lambda_ - float gae moving average factor
    
        Returns
         advantages - torch FloatTensor of generalized advantage estimates. Size = L
        """

        deltas = rewards + gamma * next_vals * (1 - dones) - values
        return cuda_if(discount(deltas, dones, gamma * lambda_))
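
The gae() method above delegates to a discount() helper that is not shown in this listing. Below is a minimal sketch of a compatible implementation, assuming discount(x, dones, factor) computes a backward discounted cumulative sum that is cut at episode boundaries; applied to the TD deltas with factor gamma * lambda_, this yields the usual GAE advantages. The repository's version may differ.

import torch

def discount(array, dones, discount_factor):
    """Backward discounted cumulative sum, reset wherever dones == 1."""
    out = torch.zeros_like(array)
    running = 0.
    for i in reversed(range(len(array))):
        running = array[i] + discount_factor * running * (1. - dones[i])
        out[i] = running
    return out
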
Code Example #6
    def rollout(self, net, idx, hyps):
        """
        rollout handles the actual rollout of the environment for n steps in time.
        It is called from run and performs a single rollout, placing the
        collected data into the shared lists found in the datas dict.

        net - torch Module object. This is the model to interact with the
            environment.
        idx - int identification number distinguishing the
            portion of the shared array designated for this runner
        hyps - dict object with all necessary hyperparameters
                Keys (Assume string type keys):
                    "gamma" - reward decay coeficient
                    "n_tsteps" - number of steps to be taken in the
                                environment
                    "n_frame_stack" - number of frames to stack for
                                creation of the mdp state
                    "preprocess" - function to preprocess raw observations
        """
        state = self.state_bookmark
        h = self.h_bookmark
        n_tsteps = hyps['n_tsteps']
        startx = idx * n_tsteps
        prev_val = None
        for i in range(n_tsteps):
            self.datas['states'][startx + i] = cuda_if(
                torch.FloatTensor(state))
            state_in = Variable(self.datas['states'][startx + i]).unsqueeze(0)
            if 'h_states' in self.datas:
                self.datas['h_states'][startx + i] = h.data[0]
                h_in = Variable(h.data)
                val, logits, h = net(state_in, h_in)
            else:
                val, logits = net(state_in)
            probs = F.softmax(logits, dim=-1)
            action = sample_action(probs.data)
            action = int(action.item())
            obs, rew, done, info = self.env.step(action + hyps['action_shift'])
            if hyps['render']:
                self.env.render()
            self.ep_rew += rew
            reset = done
            if "Pong" in hyps['env_type'] and rew != 0:
                done = True
            if done:
                self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
                self.ep_rew = 0
                # Reset Recurrence
                if h is not None:
                    h = Variable(cuda_if(torch.zeros(1, self.net.h_size)))

            self.datas['rewards'][startx + i] = rew
            self.datas['dones'][startx + i] = float(done)
            self.datas['actions'][startx + i] = action
            state = next_state(self.env,
                               self.obs_deque,
                               obs=obs,
                               reset=reset,
                               preprocess=hyps['preprocess'])
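            # With the value estimate for the new state in hand, fill in the
            # one-step TD error for the previous step:
            # delta = r_{t-1} + gamma * V(s_t) * (1 - done_{t-1}) - V(s_{t-1})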
            if i > 0:
                prev_rew = self.datas['rewards'][startx + i - 1]
                prev_done = self.datas['dones'][startx + i - 1]
                delta = prev_rew + hyps['gamma'] * val.data * (
                    1 - prev_done) - prev_val
                self.datas['deltas'][startx + i - 1] = delta
            prev_val = val.data.squeeze()

        # Funky bootstrapping
        endx = startx + n_tsteps - 1
        if not done:
            state_in = Variable(cuda_if(torch.FloatTensor(state))).unsqueeze(0)
            if 'h_states' in self.datas:
                val, logits, _ = net(state_in, Variable(h.data))
            else:
                val, logits = net(state_in)
            self.datas['rewards'][endx] += hyps['gamma'] * val.squeeze(
            )  # Bootstrap
            self.datas['dones'][endx] = 1.
        self.datas['deltas'][endx] = self.datas['rewards'][endx] - prev_val
        self.state_bookmark = state
        if h is not None:
            self.h_bookmark = h.data
Code Example #7
File: main.py  Project: yufeiwang63/PPO
cuda = torch.cuda.is_available() and not args.no_cuda

env_fns = []
for rank in range(args.num_workers):
    # Bind rank at definition time; a plain closure would give every env the final rank.
    env_fns.append(lambda rank=rank: make_env(args.env_id, rank, args.seed + rank))
if args.render:
    venv = RenderSubprocVecEnv(env_fns, args.render_interval)
else:
    venv = SubprocVecEnv(env_fns)
venv = VecFrameStack(venv, 4)

test_env = make_env(args.env_id, 0, args.seed)
test_env = FrameStack(test_env, 4)

policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
policy = cuda_if(policy, cuda)

optimizer = optim.Adam(policy.parameters())

if args.lr_func == 'linear':
    lr_func = lambda a: args.lr * (1. - a)
elif args.lr_func == 'constant':
    lr_func = lambda a: args.lr

if args.clip_func == 'linear':
    clip_func = lambda a: args.clip * (1. - a)
elif args.clip_func == 'constant':
    clip_func = lambda a: args.clip

algorithm = PPO(policy,
                venv,
Code Example #8
    def run(self, total_steps):
        """ Runs PPO

        Args:
            total_steps (int): total number of environment steps to run for
        """
        N = self.num_workers
        T = self.worker_steps
        E = self.opt_epochs
        A = self.venv.action_space.n

        while self.taken_steps < total_steps:
            progress = self.taken_steps / total_steps

            obs, rewards, masks, actions, steps = self.interact()
            ob_shape = obs.size()[2:]

            ep_reward = self.test()
            self.reward_histr.append(ep_reward)
            self.steps_histr.append(self.taken_steps)

            # statistic logic
            group_size = len(self.steps_histr) // self.plot_points
            if self.plot_reward and len(self.steps_histr) % (
                    self.plot_points * 10) == 0 and group_size >= 10:
                x_means, _, y_means, y_stds = \
                    mean_std_groups(np.array(self.steps_histr), np.array(self.reward_histr), group_size)
                fig = plt.figure()
                fig.set_size_inches(8, 6)
                plt.ticklabel_format(axis='x', style='sci', scilimits=(-2, 6))
                plt.errorbar(x_means,
                             y_means,
                             yerr=y_stds,
                             ecolor='xkcd:blue',
                             fmt='xkcd:black',
                             capsize=5,
                             elinewidth=1.5,
                             mew=1.5,
                             linewidth=1.5)
                plt.title('Training progress')
                plt.xlabel('Total steps')
                plt.ylabel('Episode reward')
                plt.savefig(self.plot_path, dpi=200)
                plt.clf()
                plt.close()
                plot_timer = 0

            # TEMP upgrade to support recurrence

            # compute advantages, returns with GAE
            obs_ = obs.view(((T + 1) * N, ) + ob_shape)
            obs_ = Variable(obs_)
            _, values = self.policy(obs_)
            values = values.view(T + 1, N, 1)
            advantages, returns = gae(rewards, masks, values, self.gamma,
                                      self.lambd)

            self.policy_old.load_state_dict(self.policy.state_dict())
            for e in range(E):
                self.policy.zero_grad()

                MB = steps // self.minibatch_steps

                b_obs = Variable(obs[:T].view((steps, ) + ob_shape))
                b_rewards = Variable(rewards.view(steps, 1))
                b_masks = Variable(masks.view(steps, 1))
                b_actions = Variable(actions.view(steps, 1))
                b_advantages = Variable(advantages.view(steps, 1))
                b_returns = Variable(returns.view(steps, 1))

                b_inds = np.arange(steps)
                np.random.shuffle(b_inds)

                for start in range(0, steps, self.minibatch_steps):
                    mb_inds = b_inds[start:start + self.minibatch_steps]
                    mb_inds = cuda_if(
                        torch.from_numpy(mb_inds).long(), self.cuda)
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_advantages, mb_returns = \
                        [arr[mb_inds] for arr in [b_obs, b_rewards, b_masks, b_actions, b_advantages, b_returns]]

                    mb_pis, mb_vs = self.policy(mb_obs)
                    mb_pi_olds, mb_v_olds = self.policy_old(mb_obs)
                    mb_pi_olds, mb_v_olds = mb_pi_olds.detach(
                    ), mb_v_olds.detach()

                    losses = self.objective(self.clip_func(progress), mb_pis,
                                            mb_vs, mb_pi_olds, mb_v_olds,
                                            mb_actions, mb_advantages,
                                            mb_returns)
                    policy_loss, value_loss, entropy_loss = losses
                    loss = policy_loss + value_loss * self.value_coef + entropy_loss * self.entropy_coef

                    set_lr(self.optimizer, self.lr_func(progress))
                    self.optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm(self.policy.parameters(),
                                                  self.max_grad_norm)
                    self.optimizer.step()

            self.taken_steps += steps
            print(self.taken_steps)
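
The update loop above calls two helpers that are not shown in the listing: gae() and set_lr(). The sketch below is written against the call sites (gae(rewards, masks, values, gamma, lambd) returning (advantages, returns) for [T, N, 1] tensors, with values holding T + 1 rows including the bootstrap value); it targets current PyTorch rather than the legacy Variable API used in the excerpt, and the repository's implementations may differ.

import torch

def gae(rewards, masks, values, gamma, lambd):
    """Generalized Advantage Estimation over [T, N, 1] rollout tensors."""
    T = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[0])
    for t in reversed(range(T)):
        # one-step TD error, masked at episode boundaries
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running = delta + gamma * lambd * masks[t] * running
        advantages[t] = running
    returns = advantages + values[:T]
    return advantages, returns

def set_lr(optimizer, lr):
    """Set the learning rate on every parameter group (used for LR annealing)."""
    for group in optimizer.param_groups:
        group['lr'] = lr
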
Code Example #9
    def rollout(self, net, idx, hyps):
        """
        rollout handles the actual rollout of the environment for n steps in time.
        It is called from run and performs a single rollout, placing the
        collected data into the shared lists found in the datas dict.

        net - torch Module object. This is the model to interact with the
            environment.
        idx - int identification number distinguishing the
            portion of the shared array designated for this runner
        hyps - dict object with all necessary hyperparameters
                Keys (Assume string type keys):
                    "gamma" - reward decay coeficient
                    "n_tsteps" - number of steps to be taken in the
                                environment
                    "n_frame_stack" - number of frames to stack for
                                creation of the mdp state
                    "preprocess" - function to preprocess raw observations
        """
        net.eval()
        hyps = self.hyps
        state = self.state_bookmark
        n_tsteps = hyps['n_tsteps']
        is_recurrent = hasattr(net, "fresh_h")
        if not is_recurrent:
            h = None
        else:
            h = self.prev_h if self.prev_h is not None else net.fresh_h()
        startx = idx * n_tsteps
        for i in range(n_tsteps):
            self.datas['states'][startx + i] = cuda_if(
                torch.FloatTensor(state))
            if is_recurrent:
                self.datas["hs"][startx + i] = cuda_if(h.detach().data)
                val, logits, h = net(self.datas['states'][startx + i][None],
                                     h=self.datas['hs'][startx + i][None])
                self.datas["next_hs"][startx + i] = cuda_if(h.detach().data)
            else:
                val, logits = net(self.datas['states'][startx + i][None])
            if self.hyps['discrete_env']:
                probs = F.softmax(logits, dim=-1)
                action = sample_action(probs.data)
                action = int(action.item())
            else:
                mu, sig = logits
                action = mu + torch.randn_like(sig) * sig
                action = action.cpu().detach().numpy().squeeze()
                if len(action.shape) == 0:
                    action = np.asarray([float(action)])
            obs, rew, done, info = self.env.step(action + hyps['action_shift'])
            if hyps['render']:
                self.env.render()
            self.ep_rew += rew
            self.datas['rews'][startx + i] = float(rew)
            reset = done
            if "Pong" in hyps['env_type'] and rew != 0:
                done = True
            if done:
                self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
                self.ep_rew = 0

            self.datas['dones'][startx + i] = 0
            if isinstance(action, np.ndarray):
                action = cuda_if(torch.from_numpy(action))
            self.datas['actions'][startx + i] = action
            state = next_state(self.env,
                               self.obs_deque,
                               obs=obs,
                               reset=reset,
                               preprocess=hyps['preprocess'])
            if i > 0:
                self.datas['next_states'][startx + i -
                                          1] = self.datas['states'][startx + i]

        endx = startx + n_tsteps - 1
        self.datas['next_states'][endx] = cuda_if(torch.FloatTensor(state))
        self.datas['dones'][endx] = 1.
        self.state_bookmark = state
        if h is not None:
            self.prev_h = h.data
Code Example #10
    def update_model(self, shared_data):
        """
        This function accepts the data collected from a rollout and performs Q value update iterations
        on the neural net.

        shared_data - dict of torch tensors with shared memory to collect data. Each 
                tensor contains indices from idx*n_tsteps to (idx+1)*n_tsteps
                Keys (assume string keys):
                    "states" - MDP states at each timestep t
                            type: FloatTensor
                            shape: (n_states, *state_shape)
                    "deltas" - gae deltas collected at timestep t+1
                            type: FloatTensor
                            shape: (n_states,)
                    "h_states" - Recurrent states at timestep t+1
                            type: FloatTensor
                            shape: (n_states, h_size)
                    "rewards" - Collects float rewards collected at each timestep t
                            type: FloatTensor
                            shape: (n_states,)
                    "dones" - Collects the dones collected at each timestep t
                            type: FloatTensor
                            shape: (n_states,)
                    "actions" - Collects actions performed at each timestep t
                            type: LongTensor
                            shape: (n_states,)
        """
        hyps = self.hyps
        net = self.net
        net.req_grads(True)

        states = shared_data['states']
        rewards = shared_data['rewards']
        dones = shared_data['dones']
        actions = shared_data['actions']
        deltas = shared_data['deltas']
        advs = cuda_if(
            discount(deltas.squeeze(), dones.squeeze(),
                     hyps['gamma'] * hyps['lambda_']))

        # Forward Pass
        if 'h_states' in shared_data:
            h_states = Variable(cuda_if(shared_data['h_states']))
            if hyps['use_bptt']:
                vals, logits = self.bptt(states, h_states, dones)
            else:
                vals, logits, _ = net(Variable(cuda_if(states)), h_states)
        else:
            vals, logits = net(Variable(cuda_if(states)))

        # Log Probabilities
        log_softs = F.log_softmax(logits, dim=-1)
        logprobs = log_softs[torch.arange(len(actions)).long(), actions]

        # Returns
        if hyps['use_nstep_rets']:
            returns = advs + vals.data.squeeze()
        else:
            returns = cuda_if(
                discount(rewards.squeeze(), dones.squeeze(), hyps['gamma']))

        # Advantages
        if hyps['norm_advs']:
            advs = (advs - advs.mean()) / (advs.std() + 1e-6)

        # A2C Losses
        pi_loss = -(logprobs.squeeze() * Variable(advs.squeeze())).mean()
        val_loss = hyps['val_coef'] * F.mse_loss(vals.squeeze(), returns)
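        # (log_softs * probs).sum(-1) is the negative entropy, so entr_loss equals
        # entr_coef * H (batch mean); subtracting it from the total loss below
        # rewards higher-entropy (more exploratory) policies.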
        entr_loss = -hyps['entr_coef'] * (
            (log_softs * F.softmax(logits, dim=-1)).sum(-1)).mean()

        loss = pi_loss + val_loss - entr_loss
        loss.backward()
        self.norm = nn.utils.clip_grad_norm_(net.parameters(),
                                             hyps['max_norm'])
        self.optim.step()
        self.optim.zero_grad()

        self.info = {
            "Loss": loss.item(),
            "Pi_Loss": pi_loss.item(),
            "ValLoss": val_loss.item(),
            "Entropy": entr_loss.item(),
            "GradNorm": self.norm.item()
        }
        return self.info
Code Example #11
    def train(self, hyps): 
        """
        hyps - dictionary of required hyperparameters
            type: dict
        """

        # Initial settings
        if "randomizeObjs" in hyps:
            assert False, "you mean randomizeObs, not randomizeObjs"
        if "audibleTargs" in hyps and hyps['audibleTargs'] > 0:
            hyps['aud_targs'] = True
            if verbose: print("Using audible targs!")
        countOut = try_key(hyps, 'countOut', 0)
        if countOut and not hyps['endAtOrigin']:
            assert False, "endAtOrigin must be true for countOut setting"

        # Print Hyperparameters To Screen
        items = list(hyps.items())
        for k, v in sorted(items):
            print(k+":", v)

        # Make Save Files
        if "save_folder" in hyps:
            save_folder = hyps['save_folder']
        else:
            save_folder = "./saved_data/"

        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        base_name = save_folder + hyps['exp_name']
        net_save_file = base_name+"_net.p"
        fwd_save_file = base_name+"_fwd.p"
        best_net_file = base_name+"_best.p"
        optim_save_file = base_name+"_optim.p"
        fwd_optim_file = base_name+"_fwdoptim.p"
        hyps['fwd_emb_file'] = base_name+"_fwdemb.p"
        if hyps['inv_model'] is not None:
            inv_save_file = base_name+"_invnet.p"
            reconinv_optim_file = base_name+"_reconinvoptim.p"
        else:
            inv_save_file = None
            reconinv_optim_file = None
        if hyps['recon_model'] is not None:
            recon_save_file = base_name+"_reconnet.p"
            reconinv_optim_file = base_name+"_reconinvoptim.p"
        else:
            recon_save_file = None
        log_file = base_name+"_log.txt"
        if hyps['resume']: log = open(log_file, 'a')
        else: log = open(log_file, 'w')
        for k, v in sorted(items):
            log.write(k+":"+str(v)+"\n")

        # Miscellaneous Variable Prep
        logger = Logger()
        shared_len = hyps['n_tsteps']*hyps['n_rollouts']
        float_params = dict()
        if "float_params" not in hyps:
            try:
                keys = hyps['game_keys']
                hyps['float_params'] = {k:try_key(hyps,k,0) for k in keys}
                if "minObjLoc" not in hyps:
                    hyps['float_params']["minObjLoc"] = 0.27
                    hyps['float_params']["maxObjLoc"] = 0.73
                float_params = hyps['float_params']
            except: pass
        env = SeqEnv(hyps['env_type'], hyps['seed'],
                                            worker_id=None,
                                            float_params=float_params)
        hyps['discrete_env'] = hasattr(env.action_space, "n")
        obs = env.reset()
        prepped = hyps['preprocess'](obs)
        hyps['state_shape'] = [hyps['n_frame_stack']*prepped.shape[0],
                              *prepped.shape[1:]]
        if not hyps['discrete_env']:
            action_size = int(np.prod(env.action_space.shape))
        elif hyps['env_type'] == "Pong-v0":
            action_size = 3
        else:
            action_size = env.action_space.n
        hyps['action_shift'] = (4-action_size)*(hyps['env_type']=="Pong-v0") 
        print("Obs Shape:,",obs.shape)
        print("Prep Shape:,",prepped.shape)
        print("State Shape:,",hyps['state_shape'])
        print("Num Samples Per Update:", shared_len)
        if not (hyps['n_cache_refresh'] <= shared_len or hyps['cache_size'] == 0):
            hyps['n_cache_refresh'] = shared_len
        print("Samples Wasted in Update:", shared_len % hyps['batch_size'])
        try: env.close()
        except: pass
        del env

        # Prepare Shared Variables
        shared_data = {
            'states': torch.zeros(shared_len,
                                *hyps['state_shape']).share_memory_(),
            'next_states': torch.zeros(shared_len,
                                *hyps['state_shape']).share_memory_(),
            'dones':torch.zeros(shared_len).share_memory_(),
            'rews':torch.zeros(shared_len).share_memory_(),
            'hs':torch.zeros(shared_len,hyps['h_size']).share_memory_(),
            'next_hs':torch.zeros(shared_len,hyps['h_size']).share_memory_()}
        if hyps['discrete_env']:
            shared_data['actions'] = torch.zeros(shared_len).long().share_memory_()
        else:
            shape = (shared_len, action_size)
            shared_data['actions']=torch.zeros(shape).float().share_memory_()
        shared_data = {k: cuda_if(v) for k,v in shared_data.items()}
        n_rollouts = hyps['n_rollouts']
        gate_q = mp.Queue(n_rollouts)
        stop_q = mp.Queue(n_rollouts)
        end_q = mp.Queue(1)
        reward_q = mp.Queue(1)
        reward_q.put(-1)

        # Make Runners
        runners = []
        for i in range(hyps['n_envs']):
            runner = Runner(shared_data, hyps, gate_q, stop_q,
                                                       end_q,
                                                       reward_q)
            runners.append(runner)

        # Make the Networks
        h_size = hyps['h_size']
        net = hyps['model'](hyps['state_shape'], action_size, h_size,
                                            bnorm=hyps['use_bnorm'],
                                            lnorm=hyps['use_lnorm'],
                                            discrete_env=hyps['discrete_env'])
        # Fwd Dynamics
        hyps['is_recurrent'] = hasattr(net, "fresh_h")
        intl_size = h_size+action_size + hyps['is_recurrent']*h_size
        block = []
        if hyps['fwd_lnorm']:
            block.append(nn.LayerNorm(intl_size))
        block += [nn.Linear(intl_size, h_size),
            nn.ReLU(), nn.Linear(h_size, h_size),
            nn.ReLU(), nn.Linear(h_size, h_size)]
        fwd_net = nn.Sequential(*block)
        # Allows us to argue an h vector along with embedding to
        # forward func
        if hyps['is_recurrent']:
            fwd_net = CatModule(fwd_net) 
        if hyps['ensemble']:
            fwd_net = Ensemble(fwd_net)
        fwd_net = cuda_if(fwd_net)

        if hyps['inv_model'] is not None:
            inv_net = hyps['inv_model'](h_size, action_size)
            inv_net = cuda_if(inv_net)
        else:
            inv_net = None
        if hyps['recon_model'] is not None:
            recon_net = hyps['recon_model'](emb_size=h_size,
                                       img_shape=hyps['state_shape'],
                                       fwd_bnorm=hyps['fwd_bnorm'],
                                       deconv_ksizes=hyps['recon_ksizes'])
            recon_net = cuda_if(recon_net)
        else:
            recon_net = None
        if hyps['resume']:
            net.load_state_dict(torch.load(net_save_file))
            fwd_net.load_state_dict(torch.load(fwd_save_file))
            if inv_net is not None:
                inv_net.load_state_dict(torch.load(inv_save_file))
            if recon_net is not None:
                recon_net.load_state_dict(torch.load(recon_save_file))
        base_net = copy.deepcopy(net)
        net = cuda_if(net)
        net.share_memory()
        base_net = cuda_if(base_net)
        hyps['is_recurrent'] = hasattr(net, "fresh_h")

        # Start Data Collection
        print("Making New Processes")
        procs = []
        for i in range(len(runners)):
            proc = mp.Process(target=runners[i].run, args=(net,))
            procs.append(proc)
            proc.start()
            print(i, "/", len(runners), end='\r')
        for i in range(n_rollouts):
            gate_q.put(i)

        # Make Updater
        updater = Updater(base_net, fwd_net, hyps, inv_net, recon_net)
        if hyps['resume']:
            updater.optim.load_state_dict(torch.load(optim_save_file))
            updater.fwd_optim.load_state_dict(torch.load(fwd_optim_file))
            if inv_net is not None:
                updater.reconinv_optim.load_state_dict(torch.load(reconinv_optim_file))
        updater.optim.zero_grad()
        updater.net.train(mode=True)
        updater.net.req_grads(True)

        # Prepare Decay Precursors
        entr_coef_diff = hyps['entr_coef'] - hyps['entr_coef_low']
        epsilon_diff = hyps['epsilon'] - hyps['epsilon_low']
        lr_diff = hyps['lr'] - hyps['lr_low']
        gamma_diff = hyps['gamma_high'] - hyps['gamma']

        # Training Loop
        past_rews = deque([0]*hyps['n_past_rews'])
        last_avg_rew = 0
        best_rew_diff = 0
        best_avg_rew = -10000
        best_eval_rew = -10000
        ep_eval_rew = 0
        eval_rew = 0

        epoch = 0
        done_count = 0
        T = 0
        try:
            while T < hyps['max_tsteps']:
                basetime = time.time()
                epoch += 1

                # Collect data
                for i in range(n_rollouts):
                    stop_q.get()
                T += shared_len

                # Reward Stats
                avg_reward = reward_q.get()
                reward_q.put(avg_reward)
                last_avg_rew = avg_reward
                done_count += shared_data['dones'].sum().item()
                new_best = False
                if avg_reward > best_avg_rew and done_count > n_rollouts:
                    new_best = True
                    best_avg_rew = avg_reward
                    updater.save_model(best_net_file, fwd_save_file,
                                                         None, None)
                eval_rew = shared_data['rews'].mean()
                if eval_rew > best_eval_rew:
                    best_eval_rew = eval_rew
                    save_names = [net_save_file, fwd_save_file,
                                                 optim_save_file,
                                                 fwd_optim_file,
                                                 inv_save_file,
                                                 recon_save_file,
                                                 reconinv_optim_file]

                    for i in range(len(save_names)):
                        if save_names[i] is not None:
                            splt = save_names[i].split(".")
                            splt[0] = splt[0]+"_best"
                            save_names[i] = ".".join(splt)
                    updater.save_model(*save_names)
                s = "EvalRew: {:.5f} | BestEvalRew: {:.5f}"
                print(s.format(eval_rew, best_eval_rew))

                # Calculate the Loss and Update nets
                updater.update_model(shared_data)
                net.load_state_dict(updater.net.state_dict()) # update all collector nets
                
                # Resume Data Collection
                for i in range(n_rollouts):
                    gate_q.put(i)

                # Decay HyperParameters
                if hyps['decay_eps']:
                    updater.epsilon = (1-T/(hyps['max_tsteps']))*epsilon_diff + hyps['epsilon_low']
                    print("New Eps:", updater.epsilon)
                if hyps['decay_lr']:
                    new_lr = (1-T/(hyps['max_tsteps']))*lr_diff + hyps['lr_low']
                    updater.new_lr(new_lr)
                    print("New lr:", new_lr)
                if hyps['decay_entr']:
                    updater.entr_coef = entr_coef_diff*(1-T/(hyps['max_tsteps']))+hyps['entr_coef_low']
                    print("New Entr:", updater.entr_coef)
                if hyps['incr_gamma']:
                    updater.gamma = gamma_diff*(T/(hyps['max_tsteps']))+hyps['gamma']
                    print("New Gamma:", updater.gamma)

                # Periodically save model
                if epoch % 10 == 0 or epoch == 1:
                    updater.save_model(net_save_file, fwd_save_file,
                                                      optim_save_file,
                                                      fwd_optim_file,
                                                      inv_save_file,
                                                      recon_save_file,
                                                      reconinv_optim_file)

                # Print Epoch Data
                past_rews.popleft()
                past_rews.append(avg_reward)
                max_rew, min_rew = deque_maxmin(past_rews)
                print("Epoch", epoch, "– T =", T, "-- Folder:", base_name)
                if not hyps['discrete_env']:
                    s = ("{:.5f} | "*net.logsigs.shape[1])
                    s = s.format(*[x.item() for x in torch.exp(net.logsigs[0])])
                    print("Sigmas:", s)
                updater.print_statistics()
                avg_action = shared_data['actions'].float().mean().item()
                print("Grad Norm:",float(updater.norm),"– Avg Action:",avg_action,"– Best AvgRew:",best_avg_rew)
                print("Avg Rew:", avg_reward, "– High:", max_rew, "– Low:", min_rew, end='\n')
                updater.log_statistics(log, T, avg_reward, avg_action, best_avg_rew)
                updater.info['AvgRew'] = avg_reward
                updater.info['EvalRew'] = eval_rew
                logger.append(updater.info, x_val=T)

                # Check for memory leaks
                gc.collect()
                max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                print("Time:", time.time()-basetime)
                if 'hyp_search_count' in hyps and hyps['hyp_search_count'] > 0 and hyps['search_id'] != None:
                    print("Search:", hyps['search_id'], "/", hyps['hyp_search_count'])
                print("Memory Used: {:.2f} memory\n".format(max_mem_used / 1024))
                if updater.info["VLoss"] == float('inf') or updater.norm == float('inf'):
                    break
        except KeyboardInterrupt:
            pass

        end_q.put(1)
        time.sleep(1)
        logger.make_plots(base_name)
        log.write("\nBestRew:"+str(best_avg_rew))
        log.close()
        # Close processes
        for p in procs:
            p.terminate()
        return best_avg_rew
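
The train() routine above also uses two small utilities that are not shown, try_key and deque_maxmin. A minimal sketch inferred from the call sites; the repository's versions may differ.

def try_key(d, key, default):
    """Return d[key] when the key is present, otherwise the default (like dict.get)."""
    return d[key] if key in d else default

def deque_maxmin(d):
    """Return the maximum and minimum of a deque of recent rewards."""
    return max(d), min(d)
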
Code Example #12
    action_size = 3
else:
    action_size = env.action_space.n
hyps['action_shift'] = (4 - action_size) * (hyps['env_type'] == "Pong-v0")
print("Obs Shape:,", obs.shape)
print("Prep Shape:,", prepped.shape)
print("State Shape:,", hyps['state_shape'])
del env

# Make Network
net = hyps['model'](hyps['state_shape'],
                    action_size,
                    h_size=hyps['h_size'],
                    bnorm=hyps['use_bnorm'])
net.load_state_dict(torch.load(file_name))
net = cuda_if(net)

# Prepare Shared Variables
shared_len = hyps['n_tsteps']
shared_data = {
    'states':
    cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
    'deltas':
    cuda_if(torch.zeros(shared_len).share_memory_()),
    'rewards':
    cuda_if(torch.zeros(shared_len).share_memory_()),
    'actions':
    torch.zeros(shared_len).long().share_memory_(),
    'dones':
    cuda_if(torch.zeros(shared_len).share_memory_())
}
Code Example #13
                        default='ep_reward.png',
                        help='path to save reward plot to')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    args = parser.parse_args()

    set_seed(args.seed)

    cuda = torch.cuda.is_available() and not args.no_cuda

    test_env = make_env(args.env_id, 0, args.seed)
    test_env = FrameStack(test_env, 4)

    policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
    checkpoint = torch.load("./save/PPO_" + self.env_name + ".pt")
    policy.load_check_point(checkpoint["policy"])
    policy = cuda_if(policy, cuda)

    ob = self.test_env.reset()
    done = False
    ep_reward = 0
    last_action = np.array([-1])
    action_repeat = 0

    while not done:
        ob = np.array(ob)
        ob = torch.from_numpy(ob.transpose((2, 0, 1))).float().unsqueeze(0)
        ob = Variable(ob / 255., volatile=True)
        ob = cuda_if(ob, self.cuda)

        pi, v = policy(ob)
        _, action = torch.max(pi, dim=1)
Code Example #14
    def train(self, hyps):
        """
        hyps - dictionary of required hyperparameters
            type: dict
        """

        # Print Hyperparameters To Screen
        items = list(hyps.items())
        for k, v in sorted(items):
            print(k + ":", v)

        # Make Save Files
        if "save_folder" in hyps:
            save_folder = hyps['save_folder']
        else:
            save_folder = "./saved_data/"

        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        base_name = save_folder + hyps['exp_name']
        net_save_file = base_name + "_net.p"
        best_net_file = base_name + "_best.p"
        optim_save_file = base_name + "_optim.p"
        log_file = base_name + "_log.txt"
        if hyps['resume']: log = open(log_file, 'a')
        else: log = open(log_file, 'w')
        for k, v in sorted(items):
            log.write(k + ":" + str(v) + "\n")

        # Miscellaneous Variable Prep
        logger = Logger()
        shared_len = hyps['n_tsteps'] * hyps['n_rollouts']
        env = gym.make(hyps['env_type'])
        obs = env.reset()
        prepped = hyps['preprocess'](obs)
        hyps['state_shape'] = [hyps['n_frame_stack']] + [*prepped.shape[1:]]
        if hyps['env_type'] == "Pong-v0":
            action_size = 3
        else:
            action_size = env.action_space.n
        hyps['action_shift'] = (4 - action_size) * (hyps['env_type']
                                                    == "Pong-v0")
        print("Obs Shape:,", obs.shape)
        print("Prep Shape:,", prepped.shape)
        print("State Shape:,", hyps['state_shape'])
        print("Num Samples Per Update:", shared_len)
        del env

        # Make Network
        net = hyps['model'](hyps['state_shape'],
                            action_size,
                            h_size=hyps['h_size'],
                            bnorm=hyps['use_bnorm'])
        if hyps['resume']:
            net.load_state_dict(torch.load(net_save_file))
        base_net = copy.deepcopy(net)
        net = cuda_if(net)
        net.share_memory()
        base_net = cuda_if(base_net)

        # Prepare Shared Variables
        shared_data = {
            'states':
            cuda_if(
                torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
            'deltas':
            cuda_if(torch.zeros(shared_len).share_memory_()),
            'rewards':
            cuda_if(torch.zeros(shared_len).share_memory_()),
            'actions':
            torch.zeros(shared_len).long().share_memory_(),
            'dones':
            cuda_if(torch.zeros(shared_len).share_memory_())
        }
        if net.is_recurrent:
            shared_data['h_states'] = cuda_if(
                torch.zeros(shared_len, net.h_size).share_memory_())
        n_rollouts = hyps['n_rollouts']
        gate_q = mp.Queue(n_rollouts)
        stop_q = mp.Queue(n_rollouts)
        reward_q = mp.Queue(1)
        reward_q.put(-1)

        # Make Runners
        runners = []
        for i in range(hyps['n_envs']):
            runner = Runner(shared_data, hyps, gate_q, stop_q, reward_q)
            runners.append(runner)

        # Start Data Collection
        print("Making New Processes")
        procs = []
        for i in range(len(runners)):
            proc = mp.Process(target=runners[i].run, args=(net, ))
            procs.append(proc)
            proc.start()
            print(i, "/", len(runners), end='\r')
        for i in range(n_rollouts):
            gate_q.put(i)

        # Make Updater
        updater = Updater(base_net, hyps)
        if hyps['resume']:
            updater.optim.load_state_dict(torch.load(optim_save_file))
        updater.optim.zero_grad()
        updater.net.train(mode=True)
        updater.net.req_grads(True)

        # Prepare Decay Precursors
        entr_coef_diff = hyps['entr_coef'] - hyps['entr_coef_low']
        lr_diff = hyps['lr'] - hyps['lr_low']
        gamma_diff = hyps['gamma_high'] - hyps['gamma']

        # Training Loop
        past_rews = deque([0] * hyps['n_past_rews'])
        last_avg_rew = 0
        best_avg_rew = -100
        epoch = 0
        T = 0
        while T < hyps['max_tsteps']:
            basetime = time.time()
            epoch += 1

            # Collect data
            for i in range(n_rollouts):
                stop_q.get()
            T += shared_len

            # Reward Stats
            avg_reward = reward_q.get()
            reward_q.put(avg_reward)
            last_avg_rew = avg_reward
            if avg_reward > best_avg_rew:
                best_avg_rew = avg_reward
                updater.save_model(best_net_file, None)

            # Calculate the Loss and Update nets
            updater.update_model(shared_data)
            net.load_state_dict(
                updater.net.state_dict())  # update all collector nets

            # Resume Data Collection
            for i in range(n_rollouts):
                gate_q.put(i)

            # Decay HyperParameters
            if hyps['decay_lr']:
                decay_factor = max((1 - T / (hyps['max_tsteps'])), 0)
                new_lr = decay_factor * lr_diff + hyps['lr_low']
                updater.new_lr(new_lr)
                print("New lr:", new_lr)
            if hyps['decay_entr']:
                decay_factor = max((1 - T / (hyps['max_tsteps'])), 0)
                updater.entr_coef = entr_coef_diff * decay_factor + hyps[
                    'entr_coef_low']
                print("New Entr:", updater.entr_coef)

            # Periodically save model
            if epoch % 10 == 0:
                updater.save_model(net_save_file, optim_save_file)

            # Print Epoch Data
            past_rews.popleft()
            past_rews.append(avg_reward)
            max_rew, min_rew = deque_maxmin(past_rews)
            rew_avg, rew_std = np.mean(past_rews), np.std(past_rews)
            updater.print_statistics()
            avg_action = shared_data['actions'].float().mean().item()
            print("Epoch", epoch, "– T =", T)
            print("Grad Norm:", float(updater.norm), "– Avg Action:",
                  avg_action, "– Best AvgRew:", best_avg_rew)
            print("Avg Rew:", avg_reward)
            print("Past " + str(hyps['n_past_rews']) + "Rews – High:", max_rew,
                  "– Low:", min_rew, "– Avg:", rew_avg, "– StD:", rew_std)
            updater.log_statistics(log, T, avg_reward, avg_action,
                                   best_avg_rew)
            updater.info['AvgRew'] = avg_reward
            logger.append(updater.info, x_val=T)

            # Check for memory leaks
            gc.collect()
            max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            print("Time:", time.time() - basetime)
            if 'hyp_search_count' in hyps and hyps[
                    'hyp_search_count'] > 0 and hyps['search_id'] != None:
                print("Search:", hyps['search_id'], "/",
                      hyps['hyp_search_count'])
            print("Memory Used: {:.2f} memory\n".format(max_mem_used / 1024))

        logger.make_plots(base_name)
        log.write("\nBestRew:" + str(best_avg_rew))
        log.close()

        # Close processes
        for p in procs:
            p.terminate()

        return best_avg_rew
Code Example #15
prepped = hyps['preprocess'](obs)
hyps['state_shape'] = [hyps['n_frame_stack']] + [*prepped.shape[1:]]
if hyps['env_type'] == "Pong-v0":
    action_size = 3
else:
    action_size = env.action_space.n
hyps['action_shift'] = (4-action_size)*(hyps['env_type']=="Pong-v0") 
print("Obs Shape:,",obs.shape)
print("Prep Shape:,",prepped.shape)
print("State Shape:,",hyps['state_shape'])
del env

# Make Network
net = hyps['model'](hyps['state_shape'], action_size, h_size=hyps['h_size'], bnorm=hyps['use_bnorm'])
net.load_state_dict(torch.load(file_name))
net = cuda_if(net)

# Prepare Shared Variables
shared_len = hyps['n_tsteps']
shared_data = {'states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
        'next_states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
        'rewards': cuda_if(torch.zeros(shared_len).share_memory_()),
        'actions': torch.zeros(shared_len).long().share_memory_(),
        'dones': cuda_if(torch.zeros(shared_len).share_memory_())}
gate_q = mp.Queue(1)
stop_q = mp.Queue(1)
reward_q = mp.Queue(1)
reward_q.put(-1)

# Make Runner
runner = Runner(shared_data, hyps, gate_q, stop_q, reward_q)