Example #1
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
        gamma = 0.99
        tau = 0.001

    #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # DDPG-style off-policy update from the replay buffer (runs every update)
        if True:
            state, next_state, action, reward, done = mem_buffer.sample(5)
            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, 6])  # 6 = hard-coded action dimension
            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                target_actor(to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True))[0])
            next_q_values.volatile = False
            target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
                done.astype(np.float)) * next_q_values
            critic.zero_grad()
            q_batch = critic(to_tensor(state), to_tensor(action))
            value_loss = criterion(q_batch, target_q_batch)
            value_loss.backward()
            critic_optim.step()
            actor_critic.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor_critic(to_tensor(state), to_tensor(state),
                             to_tensor(state))[0])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()
            soft_update(target_actor, actor_critic, tau)
            soft_update(critic_target, critic, tau)
        '''
        if args.algo in ['a2c', 'acktr']:
            action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))
            values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            #advantages = Variable(rollouts.returns[:-1]) - values
            advantages = rollouts.returns[:-1] - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages) * action_log_probs).mean()
            #action_loss = -(Variable(advantages.data) * action_log_probs).mean()


            optimizer.zero_grad()
            critic_optim.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
            critic_optim.step()
        '''
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
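
A note on the helpers: soft_update and hard_update are called throughout these examples but never shown. A minimal sketch, assuming the standard Polyak-averaging convention target <- tau * source + (1 - tau) * target:

import torch

def soft_update(target, source, tau):
    # Polyak averaging: nudge each target parameter toward the source parameter.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(s_param.data)
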
Example #2
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.AE = Actor(state_dim,action_dim).cuda()
        self.CE = Critic(state_dim,action_dim).cuda()
        self.AT = Actor(state_dim,action_dim).cuda()
        self.CT = Critic(state_dim,action_dim).cuda()
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.time_step = 0

        self.AE.load_state_dict(torch.load(MODEL_DIR+'/obs/actor_340000.pkl'))
        # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl'))
        # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))
        # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))

        self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4)
        self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4)

    def train(self):
        self.AE.train()
        data = self.replay_buffer.get_batch(BATCH_SIZE)
        bs =  np.array([da[0] for da in data])
        ba =  np.array([da[1] for da in data])
        br =  np.array([da[2] for da in data])
        bs_ = np.array([da[3] for da in data])
        bd =  np.array([da[4] for da in data])

        bs = torch.FloatTensor(bs).cuda()
        ba = torch.FloatTensor(ba).cuda()
        br = torch.FloatTensor(br).cuda()
        bs_ = torch.FloatTensor(bs_).cuda()

        a_ = self.AT(bs_)
        # NOTE: the value (TD) loss below uses the stored action batch `ba`,
        # while the policy loss (q1, computed further down) uses the actor's own output.
        q2 = self.CE(bs, ba)
        q_ = self.CT(bs_, a_).detach()
        q_tar = torch.FloatTensor(BATCH_SIZE)
        for i in range(len(data)):
            if bd[i]:
                q_tar[i] = br[i]
            else:
                q_tar[i] = br[i]+GAMMA*q_[i]

        q_tar = q_tar.view(BATCH_SIZE,1).cuda()
        # minimize mse_loss of q2 and q_tar
        td_error = F.mse_loss(q2, q_tar.detach())  # minimize td_error
        self.CE.zero_grad()
        td_error.backward(retain_graph=True)
        self.optimizer_c.step()

        a = self.AE(bs)
        q1 = self.CE(bs, a)

        a_loss = -torch.mean(q1) # maximize q
        self.AE.zero_grad()
        a_loss.backward(retain_graph=True)
        self.optimizer_a.step()

        self.soft_replace()

    def soft_replace(self):
        for t,e in zip(self.AT.parameters(),self.AE.parameters()):
            t.data = (1-TAU)*t.data + TAU*e.data
        for t,e in zip(self.CT.parameters(),self.CE.parameters()):
            t.data = (1-TAU)*t.data + TAU*e.data

    def action(self, state):
        self.AE.eval()
        state_tensor = torch.FloatTensor(state).unsqueeze(0).cuda()  # add batch_sz=1
        ac_tensor = self.AE(state_tensor)
        ac = ac_tensor.squeeze(0).cpu().detach().numpy()
        return ac
    
    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Start training once the buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            torch.save(self.AE.state_dict(), MODEL_DIR + '/obs/actor_{}.pkl'.format(self.time_step))
            torch.save(self.CE.state_dict(), MODEL_DIR + '/obs/critic_{}.pkl'.format(self.time_step))
            print('Save model state_dict successfully in obs dir...')

        return self.time_step
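
The per-sample loop that fills q_tar in Example #2's train() can be collapsed into one vectorized expression. A sketch, assuming bd is a 0/1 (or boolean) done array and q_ has shape [BATCH_SIZE, 1]:

import torch

def compute_q_target(br, bd, q_next, gamma=0.99):
    # br: reward batch, bd: done flags, q_next: target-network Q values [batch, 1].
    device = q_next.device
    r = torch.as_tensor(br, dtype=torch.float32, device=device).view(-1, 1)
    not_done = 1.0 - torch.as_tensor(bd, dtype=torch.float32, device=device).view(-1, 1)
    # Bootstrap only on non-terminal transitions.
    return r + gamma * not_done * q_next.detach()
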
Example #3
class DDPG(object):
    def __init__(self, nb_actions, nb_states, layer_norm, obs_norm, actor_lr,
                 critic_lr, SGLD_coef, noise_decay, lr_decay, batch_size,
                 discount, tau, pool_size, parameters_noise, action_noise,
                 SGLD_mode, pool_mode, with_cuda):

        self.nb_actions = nb_actions
        self.nb_states = nb_states
        self.layer_norm = layer_norm
        self.parameters_noise = parameters_noise
        self.action_noise = action_noise
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau
        self.pool_size = pool_size
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.SGLD_coef = SGLD_coef
        self.noise_coef = 1
        self.noise_decay = noise_decay
        self.lr_coef = 1
        self.lr_decay = lr_decay
        self.SGLD_mode = SGLD_mode
        self.pool_mode = pool_mode
        self.with_cuda = with_cuda

        self.actor = Actor(nb_states=self.nb_states,
                           nb_actions=self.nb_actions,
                           layer_norm=self.layer_norm)
        self.actor_target = Actor(nb_states=self.nb_states,
                                  nb_actions=self.nb_actions,
                                  layer_norm=self.layer_norm)
        self.critic = Critic(nb_states, nb_actions, layer_norm=self.layer_norm)
        self.critic_target = Critic(nb_states,
                                    nb_actions,
                                    layer_norm=self.layer_norm)
        if self.with_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        #self.actor_optim  = SGD(self.actor.parameters(), lr=actor_lr, momentum=0.9,weight_decay  = 0.01)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        #self.critic_optim  = SGD(self.critic.parameters(), lr=critic_lr, momentum=0.9,weight_decay  = 0.01)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        self.memory = Memory(int(1e6), (nb_actions, ), (nb_states, ),
                             with_cuda)
        self.obs_norm = obs_norm
        if self.obs_norm:
            self.run_obs_norm = Run_Normalizer((nb_states, ), self.with_cuda)
        self.is_training = True

        if self.pool_size > 0:
            self.agent_pool = Agent_pool(self.pool_size)

        self.s_t = None
        self.a_t = None

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        if self.is_training:
            self.memory.append(s_t, a_t, r_t, s_t1, done_t)
        if self.obs_norm:
            self.run_obs_norm.observe(s_t)
        self.s_t = s_t1

    def update(self):
        # Sample batch
        batch = self.memory.sample(self.batch_size)

        tensor_obs0 = batch['obs0']
        tensor_obs1 = batch['obs1']
        if self.obs_norm:
            tensor_obs0 = self.run_obs_norm.normalize(tensor_obs0)
            tensor_obs1 = self.run_obs_norm.normalize(tensor_obs1)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1,
                self.actor_target(tensor_obs1),
            ])

            target_q_batch = batch['rewards'] + \
                self.discount*(1-batch['terminals1'])*next_q_values
        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([tensor_obs0, batch['actions']])
        value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if (self.SGLD_mode == 2) or (self.SGLD_mode == 3):
            SGLD_update(self.critic, self.critic_lr * self.lr_coef,
                        self.SGLD_coef)
        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([tensor_obs0, self.actor(tensor_obs0)])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        if (self.SGLD_mode == 1) or (self.SGLD_mode == 3):
            SGLD_update(self.actor, self.actor_lr * self.lr_coef,
                        self.SGLD_coef)

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        return value_loss.item(), policy_loss.item()

    def apply_lr_decay(self):
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            self.critic_optim.param_groups[0][
                'lr'] = self.critic_lr * self.lr_coef

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, random=False, s_t=None, if_noise=True):
        if random:
            action = np.random.uniform(-1., 1., self.nb_actions)
        else:
            if s_t is None: raise RuntimeError()
            s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False)
            if self.with_cuda:
                s_t = s_t.cuda()
            if self.obs_norm:
                s_t = self.run_obs_norm.normalize(s_t)
            with torch.no_grad():
                action = self.actor(s_t).cpu().numpy().squeeze(0)

            if if_noise and (self.action_noise is not None):
                action += self.is_training * max(self.noise_coef,
                                                 0) * self.action_noise()
        action = np.clip(action, -1., 1.)
        self.a_t = action
        return action

    def load_weights(self, output):
        self.actor = torch.load('{}/actor.pkl'.format(output))
        self.critic = torch.load('{}/critic.pkl'.format(output))
        if self.obs_norm:
            self.run_obs_norm = torch.load('{}/obs_norm.pkl'.format(output))

    def save_model(self, output):
        torch.save(self.actor, '{}/actor.pkl'.format(output))
        torch.save(self.critic, '{}/critic.pkl'.format(output))
        if self.obs_norm:
            torch.save(self.run_obs_norm, '{}/obs_norm.pkl'.format(output))

    def get_actor_buffer(self):
        buffer = io.BytesIO()
        torch.save(self.actor, buffer)
        return buffer

    def get_norm_param(self):
        return self.run_obs_norm.mean.cpu(), self.run_obs_norm.var.cpu()

    #TODO recode agent pool
    def append_actor(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())

    def pick_actor(self):
        actor, actor_target = self.agent_pool.get_actor()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)

    def append_critic(self):
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_critic(self):
        critic, critic_target = self.agent_pool.get_critic()
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_actor_critic(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_actor_critic(self):
        actor, actor_target, critic, critic_target = self.agent_pool.get_agent(
        )
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_agent(self):
        if self.pool_mode == 1:
            self.append_actor()
        elif self.pool_mode == 2:
            self.append_critic()
        elif self.pool_mode == 3:
            self.append_actor_critic()

    def pick_agent(self):
        if self.pool_mode == 1:
            self.pick_actor()
        elif self.pool_mode == 2:
            self.pick_critic()
        elif self.pool_mode == 3:
            self.pick_actor_critic()

    def reset(self, obs):
        self.s_t = obs
        if self.action_noise is not None:
            self.action_noise.reset()
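
SGLD_update in Example #3 is not shown. In stochastic gradient Langevin dynamics the update injects Gaussian noise with standard deviation sqrt(2 * lr) into the parameters; the helper below is only a sketch consistent with how it is called here, not the original implementation:

import math
import torch

def SGLD_update(model, lr, noise_coef):
    # Add zero-mean Gaussian noise with std sqrt(2 * lr), scaled by noise_coef,
    # to every parameter after the optimizer step (Langevin noise injection).
    std = noise_coef * math.sqrt(2.0 * lr)
    with torch.no_grad():
        for p in model.parameters():
            p.add_(torch.randn_like(p) * std)
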
Example #4
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        # Convert the image from HWC to CHW layout.
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon-greedy exploration

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
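
Examples #4 through #10 lean on to_tensor/to_numpy conversion helpers written for pre-0.4 PyTorch (hence the volatile flag). A rough modern stand-in, where volatile is kept only for call-site compatibility:

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()

def to_tensor(ndarray, volatile=False, requires_grad=False):
    # Convert a numpy array (or nested list) to a float32 tensor on the active device.
    t = torch.as_tensor(np.asarray(ndarray), dtype=torch.float32,
                        device='cuda' if USE_CUDA else 'cpu')
    t.requires_grad_(requires_grad and not volatile)
    return t

def to_numpy(var):
    # Detach from the autograd graph and move to CPU before converting.
    return var.detach().cpu().numpy()
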
Example #5
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = use_cuda
        # 
        if self.use_cuda: self.cuda()
        
    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # state_batch, action_batch, reward_batch, \
        # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ])
        
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1):
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        # print(self.random_process.sample(), action)
        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level)
        action = np.clip(action, -1., 1.)
        # print(action)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )


    def save_model(self, output):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self,s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
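
rpm(args.rmsize), used in Examples #4, #5, #6 and #8, is a plain ring-buffer replay memory. The original class is not included, so the sketch below only mirrors the append/sample_batch interface the examples rely on:

import random
from collections import deque

import numpy as np

class rpm(object):
    def __init__(self, capacity):
        # Oldest transitions are discarded automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def append(self, transition):
        # transition = [s_t, a_t, r_t, s_t1, done]
        self.buffer.append(transition)

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, terminals = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, terminals
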
Example #6
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in
                              range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # Make sure each target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = 0
        for i in range(self.num_actor):
            next_q_values = next_q_values + self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[i](to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values = next_q_values / self.num_actor
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
                         self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        actions = []
        status = []
        tot_score = []
        noise_level = noise_level * max(self.epsilon, 0)  # scale the noise once, not once per actor
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i))
            )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(
                self.actors[i].state_dict(),
                '{}/actor{}_{}.pkl'.format(output, num, i)
            )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
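
Myrandom(size=nb_actions) is another helper that never appears in these snippets; it only needs sample() plus a reset method (Example #5 calls reset_states, the others reset_status). A plausible zero-mean Gaussian stand-in:

import numpy as np

class Myrandom(object):
    # Stateless Gaussian exploration noise; only a stand-in for the original class.
    def __init__(self, size, sigma=0.2):
        self.size = size
        self.sigma = sigma

    def sample(self):
        return np.random.normal(0.0, self.sigma, self.size)

    def reset_status(self):
        pass  # nothing to reset for i.i.d. Gaussian noise

    reset_states = reset_status  # alias used by Example #5
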
Example #7
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q[0][0] if q is not None else None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
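
Example #7 normalizes observations with a RunningMeanStd tracker in the style of OpenAI Baselines. A compact sketch of that running mean/variance update (parallel-variance merge of the batch statistics into the running statistics):

import numpy as np

class RunningMeanStd(object):
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4  # avoids division by zero before the first update

    def update(self, x):
        # x: batch of observations with shape [batch, *shape]
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean, self.var, self.count = new_mean, m2 / total, total
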
Example #8
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(
                    np.mean([
                        np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                        for p in self.actor.parameters()
                    ]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + (self.random_process.sample() *
                                               noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
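
A side note on the gradient logging in Example #8: torch.nn.utils.clip_grad_norm is deprecated in favor of clip_grad_norm_, which already returns the total pre-clipping gradient norm, so the per-parameter bookkeeping can be replaced with one call. A sketch that logs the total norm instead of the mean per-tensor norm:

import torch

def clip_and_log_policy_grad(actor, max_norm, writer=None, step=0):
    # Clip actor gradients in place and log the total (pre-clipping) norm.
    total_norm = torch.nn.utils.clip_grad_norm_(actor.parameters(), max_norm)
    if writer is not None:
        writer.add_scalar('train/policy_grad_norm', float(total_norm), step)
    return total_norm
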
Example #9
class DDPG(object):
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA: self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])[:, 0]
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float))*next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()

        torch.nn.utils.clip_grad_norm(self.critic.parameters(), 10.0)
        # Manual SGD step on the clipped gradients; the Adam step below updates again.
        for p in self.critic.parameters():
            p.data.add_(-CRITIC_LR, p.grad.data)
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm(self.actor.parameters(), 10.0)
        # Manual SGD step on the clipped gradients; the Adam step below updates again.
        for p in self.actor.parameters():
            p.data.add_(-ACTOR_LR, p.grad.data)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()

        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
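Example #9 (and several of the examples that follow) relies on helper functions — hard_update, soft_update, to_tensor, to_numpy — and a module-level criterion that are defined elsewhere in their original repositories. A minimal sketch consistent with how they are called here, written against the same pre-0.4 PyTorch Variable API these examples use, might look like:

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

USE_CUDA = torch.cuda.is_available()
FLOAT = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor

criterion = nn.MSELoss()  # module-level loss used by update_policy()

def to_numpy(var):
    # Pull a Variable/tensor back to a CPU numpy array.
    return var.cpu().data.numpy() if USE_CUDA else var.data.numpy()

def to_tensor(ndarray, volatile=False, requires_grad=False):
    # Wrap a numpy array as a Variable (old-style API, matching the examples).
    return Variable(torch.from_numpy(ndarray),
                    volatile=volatile,
                    requires_grad=requires_grad).type(FLOAT)

def hard_update(target, source):
    # Copy source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) +
                                param.data * tau)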
Example #10
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None     # Most recent state
        self.a_t = None     # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''
        Produce a random action
        '''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick action according to actor network.
        :param s_t: current state s_t
        :param decay_epsilon: bool.
        :param clip: tuple to clip action values between
                     clip[0] and clip[1]. Default (-1, 1)
                     Set to false if not clip.
        '''
        # Set default for clip if None
        if clip is not False and clip is None:
            clip = (-1., 1.)

        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)

        # Add noise to the action.
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        if clip is not False:
            if len(clip) != 2:
                raise ValueError('Clip parameter malformed, received {}, '
                                 'expected a size 2 tuple'.format(clip))
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

    def save_model(self, output):
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
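Examples #9 and #10 define the agent but not the loop that drives it. A rough sketch of how this interface is typically used follows; the environment name, the args object, and the warmup/max_steps values are placeholders, and the classic four-tuple gym step API is assumed:

import gym

env = gym.make('Pendulum-v0')  # placeholder environment
agent = DDPG(env.observation_space.shape[0], env.action_space.shape[0], args)

warmup, max_steps = 100, 100000
obs = env.reset()
agent.reset(obs)
for step in range(max_steps):
    # Pure exploration while the replay buffer warms up, noisy policy after.
    if step < warmup:
        action = agent.random_action()
    else:
        action = agent.select_action(obs)
    obs, reward, done, _ = env.step(action)
    agent.observe(reward, obs, done)  # store (s_t, a_t, r_t, done), advance s_t
    if step > warmup:
        agent.update_policy()         # update once the buffer holds enough samples
    if done:
        obs = env.reset()
        agent.reset(obs)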
Example #11
class DDPG:
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.critic.parameters(),
                                          self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        model_dir = self.model_dir
        if pretrain_dir is not None:
            ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
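The use_her branch above relabels stored transitions with goals that were actually achieved later in the same episode (hindsight experience replay). Stripped of the episode bookkeeping, the relabeling step reduces to roughly the following; the goal-at-the-end-of-the-observation layout and the reward_fn signature (mirroring env.cal_reward) are assumptions carried over from the example:

import numpy as np

def her_relabel(ob, new_ob, act, ach_goal, new_goal, goal_dim, reward_fn):
    # Replace the desired goal (last goal_dim entries of the observation)
    # with a goal that was actually achieved later in the episode.
    her_ob = np.concatenate((ob[:-goal_dim], new_goal), axis=0)
    her_new_ob = np.concatenate((new_ob[:-goal_dim], new_goal), axis=0)
    # Recompute the reward as if new_goal had been the target all along.
    her_reward, _, done = reward_fn(ach_goal.copy(), new_goal, act)
    return her_ob, her_new_ob, her_reward, done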
Example #12
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.train_with_dropout = args.train_with_dropout
        self.dropout_p = args.dropout_p
        self.dropout_n = args.dropout_n
        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0

        # self.save_file = open(self.save_dir + '/std.txt', "a")

        print("train_with_dropout : " + str(self.train_with_dropout))
        print("Dropout p : " + str(self.dropout_p))
        print("Dropout n : " + str(self.dropout_n))

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        net_cfg_critic = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q batch
        # TODO : (1) Also apply epistemic and aleatoric uncertainty to both actor and critic target network
        # TODO : (2) Is it proper to apply epistemic uncertainty to the target network? If so, how, and which network should provide the target? Revisit after July.
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True))
        ])[:-1]  # x : next_state_batch, a : self.actor_target(next_state_batch)
        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float)) * next_q_values

        #########################
        #  Critic update
        #########################
        self.critic.zero_grad()

        # TODO : (Completed) Add epistemic uncertainty for critic network
        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        # q_batch_mean, q_batch_var = select_q_with_dropout(state_batch, action_batch)
        # q_batch = self.critic.foward_with_dropout([to_tensor(state_batch), to_tensor(action_batch)])

        # TODO : (Completed) Add aleatoric uncertainty term from aleatoric uncertainty output of critic network (Add aleatoric uncertainty term in criterion)
        value_loss = criterion(q_batch, target_q_batch)
        # value_loss = AULoss(q_batch, target_q_batch)

        value_loss.backward()
        self.critic_optim.step()

        #########################
        #  Actor update
        #########################
        self.actor.zero_grad()

        # policy loss
        # TODO : (Completed) Add epistemic certainty term from aleatoric certainty output of policy network
        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        # policy_loss = policy_loss.mean() + 1 / self.actor(to_tensor(state_batch)[-1])

        policy_loss.backward()
        self.actor_optim.step()

        #########################
        #  Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    # def select_action(self, s_t, decay_epsilon=True):
    #     action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    #     action += self.is_training*max(self.epsilon, 0)*self.random_process.sample()
    #
    #     if decay_epsilon:
    #         self.epsilon -= self.depsilon
    #
    #     self.a_t = action
    #     return action

    def select_q_with_dropout(self, s_t, a_t):
        dropout_qs = np.array([])

        with torch.no_grad():
            for i in range(self.dropout_n):
                q_batch = to_numpy(
                    self.critic.forward_with_dropout([
                        to_tensor(s_t), to_tensor(a_t)
                    ]).squeeze(0)[:-1])  # ignore aleatoric variance term
                dropout_qs = np.append(dropout_qs, [q_batch])

        q_mean = np.mean(dropout_qs)
        q_var = np.var(dropout_qs)

        return q_mean, q_var

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        dropout_actions = np.array([])

        with torch.no_grad():
            for i in range(self.dropout_n):
                action = to_numpy(
                    self.actor.forward_with_dropout(to_tensor(np.array(
                        [s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(to_tensor(np.array(
                    [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()

        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array(
                [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()
        """
        UNFIXED RESET POINT for Mujoco
        """
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            # self.action_std = np.append(self.action_std, [np.std(dropout_actions)])

            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(np.std(dropout_actions)) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(np.mean(dropout_actions)) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", np.std(dropout_actions),
                  "            ", "dir : ", str(self.save_dir))
        """
        FIXED RESET POINT for MCC
        """
        # if s_t[0] == -0.5 and s_t[1] == 0:
        #     # print("fixed dropout actions std", np.std(dropout_actions), "            ", "dir : ", str(self.save_dir))
        #     self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        #     # np.savetxt(self.save_dir + '/std.txt', self.action_std, fmt='%4.10f', delimiter=' ')
        #     with open(self.save_dir + "/std.txt", "a") as myfile:
        #         myfile.write(str(np.std(dropout_actions))+'\n')
        #     with open(self.save_dir + "/mean.txt", "a") as myfile:
        #         myfile.write(str(np.mean(dropout_actions))+'\n')

        if not (os.path.isdir(self.save_dir + "/episode/" +
                              str(self.episode))):
            os.makedirs(
                os.path.join(self.save_dir + "/episode/" + str(self.episode)))

        self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        with open(self.save_dir + "/episode/" + str(self.episode) + "/std.txt",
                  "a") as myfile:
            myfile.write(str(np.std(dropout_actions)) + '\n')

        with open(
                self.save_dir + "/episode/" + str(self.episode) + "/mean.txt",
                "a") as myfile:
            myfile.write(str(np.mean(dropout_actions)) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        # dropout_action = np.array([np.mean(dropout_actions)])

        self.a_t = plt_action

        return plt_action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
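UADDPG estimates epistemic uncertainty by running the actor several times with dropout left active and measuring the spread of the sampled actions (Monte Carlo dropout). A self-contained illustration of that idea, independent of the Actor/Critic classes above (the tiny network below is purely hypothetical):

import torch
import torch.nn as nn

class DropoutActor(nn.Module):
    # Minimal stand-in actor with a dropout layer.
    def __init__(self, nb_states, nb_actions, hidden=64, p=0.1):
        super(DropoutActor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(nb_states, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, nb_actions), nn.Tanh())

    def forward(self, x):
        return self.net(x)

def mc_dropout_action(actor, state, n_samples=20):
    # Keep dropout active so every forward pass samples a different mask,
    # then summarize the resulting action distribution.
    actor.train()
    with torch.no_grad():
        samples = torch.stack([actor(state) for _ in range(n_samples)])
    return samples.mean(dim=0), samples.std(dim=0)  # action estimate, epistemic std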
Example #13
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder)
    # logger.save_args(args)

    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    ### for the number of processes to use
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    ## ALE Environments : mostly has Discrete action_space type
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    ### shape==3 for ALE Environments : States are 3D (image pixels)
    if len(envs.observation_space.shape) == 3:
        actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy,
                      envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
        baseline_target = Baseline_Critic(in_channels=4,
                                          num_actions=envs.action_space.n)

    if args.cuda:
        actor.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()
        baseline_target.cuda()

    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr)
    baseline_optim = optim.Adam(actor.parameters(), lr=1e-4)
    tau_soft_update = 0.001

    mem_buffer = ReplayBuffer()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    for j in range(num_updates):

        temperature = 1.0

        ## num_steps = 5 as in A2C
        for step in range(args.num_steps):
            temperature = temperature / (step + 1)
            # Sample actions
            action, action_log_prob, states, dist_entropy = actor.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True), temperature,
                envs.action_space.n, args.num_processes)

            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, dist_entropy.data,
                            value.data, reward, masks)

        nth_step_return = rollouts.returns[0].cpu().numpy()
        current_state = rollouts.observations[0].cpu().numpy()
        nth_state = rollouts.observations[-1].cpu().numpy()
        current_action = rollouts.action_log_probs[0].cpu().numpy()
        current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy()

        mem_buffer.add((current_state, nth_state, current_action,
                        nth_step_return, done, current_action_dist_entropy))
        action, action_log_prob, states, dist_entropy = actor.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True), temperature,
            envs.action_space.n, args.num_processes)  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        bs_size = args.batch_size
        if len(mem_buffer.storage) >= bs_size:
            ##samples from the replay buffer
            state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample(
                bs_size)

            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, envs.action_space.n])

            #current Q estimate
            q_batch = critic(to_tensor(state), to_tensor(action))
            # target Q estimate
            next_state_action_probs = target_actor(
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True))

            next_q_values = critic_target(to_tensor(next_state, volatile=True),
                                          next_state_action_probs[1])
            next_q_values.volatile = False
            target_q_batch = to_tensor(returns) + args.gamma * to_tensor(
                done.astype(np.float)) * next_q_values

            critic.zero_grad()
            value_loss = criterion(q_batch, target_q_batch)

            if args.gradient_penalty == True:
                gradients = torch.autograd.grad(value_loss,
                                                critic.parameters(),
                                                allow_unused=True,
                                                retain_graph=True,
                                                create_graph=True,
                                                only_inputs=True)[0]
                gradient_penalty = ((gradients.norm(2, dim=1) - 1)**
                                    2).mean() * args.lambda_grad_penalty
                gradient_penalty.backward()

            else:
                value_loss = criterion(q_batch, target_q_batch)
                value_loss.backward()

            critic_optim.step()

            actor.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor(to_tensor(state), to_tensor(state), to_tensor(state))[0])

            ### Soft trust region constraint for the actor
            current_action_probs = actor(to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False))[0]
            target_action_probs = target_actor(to_tensor(state, volatile=True),
                                               to_tensor(state, volatile=True),
                                               to_tensor(state,
                                                         volatile=True))[0]

            policy_regularizer = criterion(current_action_probs,
                                           target_action_probs)

            ## Actor update with entropy penalty
            policy_loss = policy_loss.mean() \
                - args.entropy_coef * Variable(torch.from_numpy(
                    np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \
                + args.actor_kl_lambda * policy_regularizer

            if args.actor_several_updates == True:
                for p in range(args.actor_updates):
                    policy_loss.backward(retain_graph=True)
            else:
                policy_loss.backward()

            ##clipping of gradient norms
            gradient_norms = nn.utils.clip_grad_norm(actor.parameters(),
                                                     args.max_grad_norm)
            print("gradient_norms", gradient_norms)
            actor_optim.step()

            if args.second_order_grads == True:
                """
                Training the Baseline critic (f(s, \mu(s)))
                """
                baseline_target.zero_grad()
                ## f(s, \mu(s))
                current_baseline = baseline_target(
                    to_tensor(state),
                    actor(to_tensor(state), to_tensor(state),
                          to_tensor(state))[0])

                ## \grad f(s,a)
                grad_baseline_params = torch.autograd.grad(
                    current_baseline.mean(),
                    actor.parameters(),
                    retain_graph=True,
                    create_graph=True)

                ## MSE : (Q - f)^{2}
                baseline_loss = (q_batch.detach() -
                                 current_baseline).pow(2).mean()
                # baseline_loss.volatile=True

                actor.zero_grad()
                baseline_target.zero_grad()
                grad_norm = 0
                for grad_1, grad_2 in zip(grad_params, grad_baseline_params):
                    grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum()
                grad_norm = grad_norm.sqrt()

                ##Loss for the Baseline approximator (f)
                overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm
                overall_loss.backward()
                baseline_optim.step()

            soft_update(target_actor, actor, tau_soft_update)
            soft_update(critic_target, critic, tau_soft_update)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and len(
                mem_buffer.storage) >= bs_size:
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor
            if args.cuda:
                save_model = copy.deepcopy(actor).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0],
                        entropy_log_prob.mean()))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            all_value_loss = [value_loss.data.cpu().numpy()[0]]
            all_policy_loss = [policy_loss.data.cpu().numpy()[0]]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, all_policy_loss)
            # # logger.save()

        if args.vis and j % args.vis_interval == 0:

            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Example #14
class DDPG(object):
    def __init__(self, states_num, actions_num, args):

        self.states_num = states_num
        self.actions_num = actions_num
        self.epsilon = 1.0

        self.actor = Actor(self.states_num, self.actions_num, args.hidden1,
                           args.hidden2)
        self.actor_target = Actor(self.states_num, self.actions_num,
                                  args.hidden1, args.hidden2)

        self.critic = Critic(self.states_num, self.actions_num, args.hidden1,
                             args.hidden2)
        self.critic_target = Critic(self.states_num, self.actions_num,
                                    args.hidden1, args.hidden2)

        initialize(self.actor_target,
                   self.actor)  # Make sure the target starts with the same weights
        initialize(self.critic_target, self.critic)

        # Create replay buffer and optimizers (SequentialMemory, Adam, and the
        # args fields used here are assumed to match the other examples).
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.epsilon_decrease_factor = 1.0 / args.epsilon

        self.last_state = None
        self.last_action = None

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True))
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        update(self.actor_target, self.actor, self.tau)
        update(self.critic_target, self.critic, self.tau)

    def observe(self, reward_now, state_next, done):
        self.memory.append(self.last_state, self.last_action, reward_now, done)
        self.last_state = state_next

    def random_action(self):
        action = np.random.uniform(-100., 100., self.actions_num)
        self.last_action = action
        return action
Example #15
class DDPG(object):
    def __init__(self,
                 memory,
                 nb_status,
                 nb_actions,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 actor_lr=1e-4,
                 critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, (q[0][0] if q is not None else None)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic(
            [to_tensor(batch['obs0']),
             to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic(
            [to_tensor(batch['obs0']),
             self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
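Example #15 and most of the later examples also assume to_tensor / to_numpy conversion helpers. The snippets were written against pre-0.4 PyTorch (hence the volatile flags), so the sketch below is only a rough modern stand-in for what such helpers typically do:

import numpy as np
import torch

def to_tensor(ndarray, volatile=False, device='cpu'):
    # Convert a NumPy array (or nested list) to a float32 tensor; the old
    # volatile flag is approximated by detaching the result from autograd.
    t = torch.as_tensor(np.asarray(ndarray), dtype=torch.float32, device=device)
    return t.detach() if volatile else t

def to_numpy(tensor):
    # Move a tensor to host memory and strip autograd information.
    return tensor.detach().cpu().numpy()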
Example #16
class DDPG():
    def __init__(self,
                 env,
                 action_dim,
                 state_dim,
                 device,
                 critic_lr=3e-4,
                 actor_lr=3e-4,
                 gamma=0.99,
                 batch_size=100,
                 validate_steps=100,
                 max_episode_length=150):
        """
        param: env: A gym environment
        param: action_dim: Size of action space
        param: state_dim: Size of state space
        param: device: The device used for training
        param: critic_lr: Learning rate of the critic
        param: actor_lr: Learning rate of the actor
        param: gamma: The discount factor
        param: batch_size: The batch size for training
        param: validate_steps: Number of iterations after which we evaluate the trained policy
        param: max_episode_length: Maximum number of steps per episode
        """
        self.gamma = gamma
        self.batch_size = batch_size
        self.env = env
        self.device = device
        self.eval_env = deepcopy(env)
        self.validate_steps = validate_steps
        self.max_episode_length = max_episode_length

        # actor and actor_target where both networks have the same initial weights
        self.actor = Actor(state_dim=state_dim,
                           action_dim=action_dim).to(self.device)
        self.actor_target = deepcopy(self.actor)

        # critic and critic_target where both networks have the same initial weights
        self.critic = Critic(state_dim=state_dim,
                             action_dim=action_dim).to(self.device)
        self.critic_target = deepcopy(self.critic)

        # Optimizer for the actor and critic
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)

        # Replay buffer
        self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000, state_dim=state_dim, \
                                         action_dim=action_dim, env=env, device = device)

    def update_target_networks(self):
        """
        A function to update the target networks
        """
        weighSync(self.actor_target, self.actor)
        weighSync(self.critic_target, self.critic)

    def update_network(self, batch):
        """
        A function to update the function just once
        """

        # Sample and parse batch
        state, action, reward, state_next, done = self.ReplayBuffer.batch_sample(
            batch)

        # Predicting the next action and q_value
        action_next = self.actor_target(state_next)
        q_next = self.critic_target(state_next, action_next)
        target_q = reward + (self.gamma * done * q_next)

        q = self.critic(state, action)

        # Critic update
        self.critic.zero_grad()
        value_loss = F.mse_loss(q, target_q)
        value_loss.backward()
        self.optimizer_critic.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(state, self.actor(state)).mean()
        policy_loss.backward()
        self.optimizer_actor.step()

        # Target update
        self.update_target_networks()
        return value_loss.item(), policy_loss.item()

    def select_action(self, state, isEval):

        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action = self.actor(state).squeeze(0).detach()
        if isEval:
            return action.cpu().numpy()
        action += torch.normal(0, 0.1, size=action.shape).to(self.device)
        action = torch.clamp(action, -1., 1.).cpu().numpy()
        return action

    def train(self, num_steps):
        """
        Train the policy for the given number of iterations
        :param num_steps:The number of steps to train the policy for
        """
        value_losses, policy_losses, validation_reward, validation_steps = [],[],[],[]

        step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None

        while step < num_steps:
            # reset if it is the start of episode
            if state is None:
                state = deepcopy(self.env.reset())

            action = self.select_action(state, False)
            # env response with next_state, reward, terminate_info
            state_next, reward, done, _ = self.env.step(action)
            state_next = deepcopy(state_next)

            if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                done = True

            # observe and store in replay buffer
            self.ReplayBuffer.buffer_add(
                Exp(state=state,
                    action=action,
                    reward=reward,
                    state_next=state_next,
                    done=done))

            # update policy based on sampled batch
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            value_loss, policy_loss = self.update_network(batch)
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)

            # evaluate
            if step % self.validate_steps == 0:
                validate_reward, steps = self.evaluate()
                validation_reward.append(validate_reward)
                validation_steps.append(steps)
                print(
                    "[Eval {:06d}/{:06d}] Steps: {:06d}, Episode Reward:{:04f}"
                    .format(step, int(num_steps), steps, validate_reward))

            # update
            step += 1
            episode_steps += 1
            episode_reward += reward
            state = deepcopy(state_next)

            if done:  # reset at the end of episode
                #print("[Train {:06d}/{:06d}] - Episode Reward:{:04f} ".format(step, num_steps, step, episode_reward))
                episode_steps, episode_reward, state = 0, 0., None
                episode += 1

        return value_losses, policy_losses, validation_reward, validation_steps

    def evaluate(self):
        """
        Evaluate the policy trained so far in an evaluation environment
        """
        state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0

        while not done:
            action = self.select_action(state, True)
            state_next, reward, done, _ = self.eval_env.step(action)
            total_reward += reward
            steps += 1
            state = state_next
        return total_reward / steps, steps
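A hedged usage sketch for the class above, assuming the Actor, Critic and ReplayBuffer definitions it imports are available; the environment name and step count are placeholders:

import gym
import torch

# Illustrative only: 'Pendulum-v1' and num_steps are assumptions, not part of the example.
env = gym.make('Pendulum-v1')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = DDPG(env=env,
             action_dim=env.action_space.shape[0],
             state_dim=env.observation_space.shape[0],
             device=device)
value_losses, policy_losses, val_rewards, val_steps = agent.train(num_steps=10000)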
Example #17
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
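Example #17 samples exploration noise from an OrnsteinUhlenbeckProcess imported elsewhere. A minimal sketch of the standard OU process with the attribute names the call sites expect (the project's actual class may differ):

import numpy as np

class OrnsteinUhlenbeckProcess:
    # Temporally correlated noise commonly used for DDPG exploration.
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = x
        return x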
Example #18
def training(opt):

    # ~~~~~~~~~~~~~~~~~~~ hyper parameters ~~~~~~~~~~~~~~~~~~~ #

    EPOCHS = opt.epochs
    CHANNELS = 1
    H, W = 64, 64
    work_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    FEATURE_D = 128
    Z_DIM = 100
    BATCH_SIZE = opt.batch_size

    # ~~~~~~~~~~~~~~~~~~~ as per WGAN paper ~~~~~~~~~~~~~~~~~~~ #

    lr = opt.lr
    CRITIC_TRAIN_STEPS = 5
    WEIGHT_CLIP = 0.01

    print(f"Epochs: {EPOCHS}| lr: {lr}| batch size {BATCH_SIZE}|" +
          f" device: {work_device}")

    # ~~~~~~~~~~~ creating directories for weights ~~~~~~~~~~~ #

    if opt.logs:
        log_dir = Path(f'{opt.logs}').resolve()
        if log_dir.exists():
            shutil.rmtree(str(log_dir))

    if opt.weights:
        Weight_dir = Path(f'{opt.weights}').resolve()
        if not Weight_dir.exists():
            Weight_dir.mkdir()

    # ~~~~~~~~~~~~~~~~~~~ loading the dataset ~~~~~~~~~~~~~~~~~~~ #

    trans = transforms.Compose([
        transforms.Resize((H, W)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, ), (0.5, ))
    ])

    MNIST_data = MNIST(str(opt.data_dir), True, transform=trans, download=True)

    loader = DataLoader(
        MNIST_data,
        BATCH_SIZE,
        True,
        num_workers=2,
        pin_memory=True,
    )

    # ~~~~~~~~~~~~~~~~~~~ creating tensorboard variables ~~~~~~~~~~~~~~~~~~~ #

    writer_fake = SummaryWriter(f"{str(log_dir)}/fake")
    writer_real = SummaryWriter(f"{str(log_dir)}/real")
    loss_writer = SummaryWriter(f"{str(log_dir)}/loss")

    # ~~~~~~~~~~~~~~~~~~~ loading the model ~~~~~~~~~~~~~~~~~~~ #

    critic = Critic(img_channels=CHANNELS, feature_d=FEATURE_D).to(work_device)
    gen = Faker(Z_DIM, CHANNELS, FEATURE_D).to(work_device)

    if opt.resume:
        if Path(Weight_dir / 'critic.pth').exists():

            critic.load_state_dict(
                torch.load(str(Weight_dir / 'critic.pth'),
                           map_location=work_device))

        if Path(Weight_dir / 'generator.pth').exists():

            gen.load_state_dict(
                torch.load(str(Weight_dir / 'generator.pth'),
                           map_location=work_device))

    # ~~~~~~~~~~~~~~~~~~~ create optimizers ~~~~~~~~~~~~~~~~~~~ #

    critic_optim = optim.RMSprop(critic.parameters(), lr)
    gen_optim = optim.RMSprop(gen.parameters(), lr)

    # ~~~~~~~~~~~~~~~~~~~ training loop ~~~~~~~~~~~~~~~~~~~ #

    # loss variables
    C_loss_prev = math.inf
    G_loss_prev = math.inf
    C_loss = 0
    G_loss = 0
    C_loss_avg = 0
    G_loss_avg = 0

    print_gpu_details()

    # setting the models to train mode
    critic.train()
    gen.train()

    for epoch in range(EPOCHS):

        # reset the average loss to zero
        C_loss_avg = 0
        G_loss_avg = 0

        print_memory_utilization()

        for batch_idx, (real, _) in enumerate(tqdm(loader)):

            real = real.to(work_device)
            fixed_noise = torch.rand(real.shape[0], Z_DIM, 1,
                                     1).to(work_device)

            # ~~~~~~~~~~~~~~~~~~~ critic loop ~~~~~~~~~~~~~~~~~~~ #
            with torch.no_grad():
                fake = gen(fixed_noise)  # dim of (N,1,W,H)

            for _ in range(CRITIC_TRAIN_STEPS):

                critic.zero_grad()
                # ~~~~~~~~~~~ weight cliping as per WGAN paper ~~~~~~~~~~ #

                for p in critic.parameters():
                    p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP)

                # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ #

                # make it one dimensional array
                real_predict = critic(real).view(-1)
                # make it one dimensional array
                fake_predict = critic(fake.detach()).view(-1)

                # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ #

                C_loss = -(torch.mean(fake_predict) - torch.mean(real_predict))
                C_loss_avg += C_loss.item()

                # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ #

                C_loss.backward()
                critic_optim.step()

            # ~~~~~~~~~~~~~~~~~~~ generator loop ~~~~~~~~~~~~~~~~~~~ #
            gen.zero_grad()

            # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ #

            # make it one dimensional array
            fake_predict = critic(fake).view(-1)

            # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ #

            G_loss = -(torch.mean(fake_predict))
            G_loss_avg += G_loss.item()

            # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ #

            G_loss.backward()
            gen_optim.step()

            # ~~~~~~~~~~~~~~~~~~~ loading the tensorboard ~~~~~~~~~~~~~~~~~~~ #

            # will execute at every 50 steps
            if (batch_idx + 1) % 50 == 0:

                # ~~~~~~~~~~~~ calculate average loss ~~~~~~~~~~~~~ #

                C_loss_avg_ = C_loss_avg / (CRITIC_TRAIN_STEPS * batch_idx)
                G_loss_avg_ = G_loss_avg / (batch_idx)

                print(f"Epoch [{epoch}/{EPOCHS}] | batch size {batch_idx}" +
                      f"Loss C: {C_loss_avg_:.4f}, loss G: {G_loss_avg_:.4f}")

                # ~~~~~~~~~~~~ send data to tensorboard ~~~~~~~~~~~~~ #

                with torch.no_grad():
                    critic.eval()
                    gen.eval()
                    if BATCH_SIZE > 32:
                        fake = gen(fixed_noise[:32]).reshape(
                            -1, CHANNELS, H, W)
                        data = real[:32].reshape(-1, CHANNELS, H, W)
                    else:
                        fake = gen(fixed_noise).reshape(-1, CHANNELS, H, W)
                        data = real.reshape(-1, CHANNELS, H, W)

                    img_grid_fake = torchvision.utils.make_grid(fake,
                                                                normalize=True)
                    img_grid_real = torchvision.utils.make_grid(data,
                                                                normalize=True)

                    step = (epoch + 1) * (batch_idx + 1)

                    writer_fake.add_image("Mnist Fake Images",
                                          img_grid_fake,
                                          global_step=step)
                    writer_real.add_image("Mnist Real Images",
                                          img_grid_real,
                                          global_step=step)
                    loss_writer.add_scalar('Critic', C_loss, global_step=step)
                    loss_writer.add_scalar('generator',
                                           G_loss,
                                           global_step=step)

                # changing back the model to train mode
                critic.train()
                gen.train()

        # ~~~~~~~~~~~~~~~~~~~ saving the weights ~~~~~~~~~~~~~~~~~~~ #

        if opt.weights:
            if C_loss_prev > C_loss_avg:
                C_loss_prev = C_loss_avg
                weight_path = str(Weight_dir / 'critic.pth')
                torch.save(critic.state_dict(), weight_path)

            if G_loss_prev > G_loss_avg:
                G_loss_prev = G_loss_avg
                weight_path = str(Weight_dir / 'generator.pth')
                torch.save(gen.state_dict(), weight_path)
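The training loop above calls print_gpu_details and print_memory_utilization, which are not reproduced here. A plausible sketch using only standard torch.cuda queries (assumed helpers, not the project's own):

import torch

def print_gpu_details():
    # Report the device the run will use; degrades gracefully on CPU-only machines.
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("No CUDA device available, running on CPU")

def print_memory_utilization():
    # Print how much GPU memory the current process is using.
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024 ** 2
        reserved = torch.cuda.memory_reserved() / 1024 ** 2
        print(f"GPU memory: {allocated:.1f} MiB allocated, {reserved:.1f} MiB reserved")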
Example #19
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_targets = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_optims = [
            Adam(self.actors[i].parameters(), lr=args.prate)
            for i in range(self.num_actor)
        ]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(
                self.actor_targets[i],
                self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            print('label 1')
            print('size = ', state_batch.shape)
            state_batch = self.cnn(state_batch)
            print('label 2')
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.actor_target(next_state_batch)])
        else:
            index = np.random.randint(low=0, high=self.num_actor)
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[index](to_tensor(next_state_batch,
                                                    volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch),
                 to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()

            policy_loss = -self.critic([
                to_tensor(state_batch), self.actors[i](to_tensor(state_batch))
            ])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def eval(self):
        for i in range(self.num_actor):
            self.actors[i].eval()
            self.actor_targets[i].eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        for i in range(self.num_actor):
            self.actors[i].train()
            self.actor_targets[i].train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        actions = []
        status = []
        tot_score = []
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](to_tensor(
                np.array([s_t]), volatile=True))).squeeze(0)
            noise_level = noise_level * max(self.epsilon, 0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        scores = self.critic([
            to_tensor(np.array(status), volatile=True),
            to_tensor(np.array(actions), volatile=True)
        ])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
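Examples #19, #24 and #25 store transitions in an rpm(args.rmsize) buffer that exposes append(...) and sample_batch(batch_size). A minimal deque-based sketch of such a buffer (an assumption about its interface, not the real rpm class):

import random
from collections import deque
import numpy as np

class rpm:
    # Hypothetical ring-buffer replay memory matching the append / sample_batch call sites.
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def append(self, transition):
        # transition == [state, action, reward, next_state, done]
        self.buffer.append(transition)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)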
Example #20
for epoch in range(num_epochs):
    for batch_idx, (real, labels) in enumerate(loader):
        real = real.to(device)
        labels = labels.to(device)

        # For Critic
        for _ in range(critic_n):
            noise = torch.randn((batch_size, z_dim, 1, 1)).to(device)
            fake = gen(noise, labels)
            critic_real = critic(real, labels)
            critic_fake = critic(fake, labels)
            gp = gradient_penalty(critic, real, labels, fake, device)
            loss_critic = (-(torch.mean(critic_real) - torch.mean(critic_fake))
                           + lambda_gp * gp)

            critic.zero_grad()
            loss_critic.backward(retain_graph=True)
            opt_critic.step()

        # For Generator
        output = critic(fake, labels).view(-1)
        loss_gen = -torch.mean(output)
        gen.zero_grad()
        loss_gen.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                      Loss D: {lossD:.4f}, loss G: {lossG:.4f}")
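Example #20 depends on a gradient_penalty helper that is not shown. Below is a standard WGAN-GP penalty written to match the critic(real, labels) call signature used above; it is a sketch, not the project's exact function:

import torch

def gradient_penalty(critic, real, labels, fake, device):
    # WGAN-GP: penalise the critic when the gradient norm at points interpolated
    # between real and fake samples deviates from 1.
    batch_size = real.shape[0]
    eps = torch.rand(batch_size, 1, 1, 1, device=device)
    interpolated = (real * eps + fake * (1 - eps)).detach().requires_grad_(True)

    mixed_scores = critic(interpolated, labels)
    gradients = torch.autograd.grad(
        outputs=mixed_scores,
        inputs=interpolated,
        grad_outputs=torch.ones_like(mixed_scores),
        create_graph=True,
        retain_graph=True,
    )[0]
    gradients = gradients.reshape(batch_size, -1)
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()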
Example #21
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (
                self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        if train_mode:
            actions += self.action_noise.sample()

        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()

        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # skip layer-norm parameters; only perturb the rest
            param = params[name]
            random = torch.randn(param.shape)
            if use_cuda:
                random = random.cuda()
            param += random * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update(
                )  # soft update the target network towards the actual networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_per:
            (states, actions, rewards, states_next,
             dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):

        filename = os.path.join(
            model_dir,
            f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(
                t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False,
                                           False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
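The agent above builds a ParameterSpaceNoise object with a current_stddev attribute and an adapt(distance) method. A minimal sketch in the spirit of adaptive parameter-space noise, with names mirroring those call sites (the real class may differ):

class ParameterSpaceNoise:
    # Hypothetical adaptive parameter noise: grow the perturbation scale when the
    # measured action distance is too small, shrink it when it is too large.
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient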
Example #22
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1' : args.hidden1,
            'hidden2' : args.hidden2
        }

        # Actor and Critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay Buffer and noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions), sigma=float(0.2) * np.ones(nb_actions))

        self.last_state = None
        self.last_action = None

        # Hyper parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def reset(self, obs):
        self.last_state = obs
        self.noise.reset()

    def observe(self, reward, state, done):
        self.memory.append([self.last_state, self.last_action, reward, state, done])
        self.last_state = state

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def select_action(self, state, apply_noise=False):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0)
        self.train()
        if apply_noise:
            action = action + self.noise.sample()
        action = np.clip(action, -1., 1.)
        self.last_action = action
        #print('action:', action, 'output:', action.argmax())
        return action.argmax() if self.discrete else action

    def update_policy(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)
        state = to_tensor(np.array(state_batch), device=device)
        action = to_tensor(np.array(action_batch), device=device)
        next_state = to_tensor(np.array(next_state_batch), device=device)

        # compute target Q value
        next_q_value = self.critic_target([next_state, self.actor_target(next_state)])
        target_q_value = to_tensor(reward_batch, device=device) \
                         + self.discount * to_tensor((1 - terminal_batch.astype(np.float)), device=device) * next_q_value

        # Critic and Actor update
        self.critic.zero_grad()
        with torch.set_grad_enabled(True):
            q_values = self.critic([state, action])
            critic_loss = criterion(q_values, target_q_value.detach())
            critic_loss.backward()
            self.critic_optim.step()

        self.actor.zero_grad()
        with torch.set_grad_enabled(True):
            policy_loss = -self.critic([state.detach(), self.actor(state)]).mean()
            policy_loss.backward()
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean())

    def save_model(self, output, num=1):
        if self.use_cuda:
            self.actor.to(torch.device("cpu"))
            self.critic.to(torch.device("cpu"))
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.to(device)
            self.critic.to(device)

    def load_model(self, output, num=1):
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        if self.use_cuda:
            self.cuda()
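A hedged interaction-loop sketch for the agent above, showing the intended reset / random_action / select_action / observe / update_policy cycle; the environment, warm-up length and step budget are placeholders:

import gym

# Illustrative only: `agent` is assumed to be an already-constructed DDPG instance
# from the class above, and 'Pendulum-v1' is just an example environment.
env = gym.make('Pendulum-v1')
obs = env.reset()
agent.reset(obs)

for step in range(10000):
    if step < 1000:                      # warm-up phase with random actions
        action = agent.random_action()
    else:
        action = agent.select_action(obs, apply_noise=True)
    obs, reward, done, _ = env.step(action)
    agent.observe(reward, obs, done)
    if step >= 1000:
        agent.update_policy()
    if done:
        obs = env.reset()
        agent.reset(obs)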
Example #23
File: ddpg.py Project: YuanyeMa/RL
class DDPGAgent:
    def __init__(self,
                 plot=True,
                 seed=1,
                 env: gym.Env = None,
                 batch_size=128,
                 learning_rate_actor=0.001,
                 learning_rate_critic=0.001,
                 weight_decay=0.01,
                 gamma=0.999):

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001

        self._to_tensor = util.to_tensor
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim,
                                  self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(),
            self.learning_rate_critic,
            weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from the replay memory and update the parameters of both networks.
        '''
        # Randomly sample transitions from the memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # Optimize the critic network parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)

        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma*self._to_tensor(terminal_batch.astype(np.float), device=self.device)*next_q_values
        q_batch = self.critic.forward(s0,
                                      self._to_tensor(a0, device=self.device))

        # Compute the critic loss and update the critic network parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        #self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Backpropagate using the critic's value estimate of the state as the policy objective
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))  # gradient ascent on Q
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        #self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
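Example #23 samples memory entries that expose .state, .action, .reward, .next_state and .is_done. A minimal sketch of such a transition record and buffer (an assumed structure, not the project's util module):

import random
from collections import deque, namedtuple

# Hypothetical transition record matching the attribute accesses in _learn_from_memory.
Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'is_done'])

class Memory:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, is_done):
        self.buffer.append(Transition(state, action, reward, next_state, is_done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)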
Example #24
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer != None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
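A side note on the gradient clipping in update_policy above: torch.nn.utils.clip_grad_norm is deprecated in current PyTorch releases in favour of the trailing-underscore variant. A minimal equivalent, with actor and clip_value standing in for self.actor and self.clip_actor_grad:

import torch

# Modern spelling of the in-place gradient-norm clipping used above.
torch.nn.utils.clip_grad_norm_(actor.parameters(), max_norm=float(clip_value))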
Example #25
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch),
                 to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
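        # Actor update: maximize Q(s, mu(s)) by minimizing its negative.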
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic(
                [to_tensor(state_batch),
                 self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
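        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target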
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        if self.pic:
            action = np.concatenate(
                (softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self,
                      s_t,
                      decay_epsilon=True,
                      return_fix=False,
                      noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = (action +
                      self.random_action(fix=True)) / 2.  # epsilon-greedy blend with a random action

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action

        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
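For orientation, here is a minimal training-loop sketch showing how the agent above is typically driven. It assumes the class is named DDPG (the class statement itself is not shown above), a Gym-style env, and an args namespace carrying the fields used in __init__; warmup and max_steps are hypothetical placeholders, not part of the original example.

# Illustrative sketch only: `env`, `args`, `warmup`, and `max_steps` are assumed inputs.
agent = DDPG(nb_status=env.observation_space.shape[0],
             nb_actions=env.action_space.shape[0],
             args=args, writer=None)

obs = env.reset()
agent.reset(obs)
for step in range(max_steps):
    # Pure random actions during warm-up to fill the replay buffer.
    if step < warmup:
        action = agent.random_action()
    else:
        action = agent.select_action(obs, noise_level=0.1)
    obs, reward, done, _ = env.step(action)
    agent.observe(reward, obs, done)   # stores (s_t, a_t, r_t, s_{t+1}, done)
    if step >= warmup:
        policy_objective, value_loss = agent.update_policy()
    if done:
        obs = env.reset()
        agent.reset(obs)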
Example #26
0
class DDPG(object):
    def __init__(self):

        # random seed for torch
        __seed = config.get(MODEL_SEED)
        self.policy_loss = []
        self.critic_loss = []
        if __seed > 0:
            self.seed(__seed)

        self.nb_states = config.get(MODEL_STATE_COUNT)
        self.nb_actions = config.get(MODEL_ACTION_COUNT)

        # Create Actor and Critic Network
        actor_net_cfg = {
            'hidden1': config.get(MODEL_ACTOR_HIDDEN1),
            'hidden2': config.get(MODEL_ACTOR_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }
        critic_net_cfg = {
            'hidden1': config.get(MODEL_CRITIC_HIDDEN1),
            'hidden2': config.get(MODEL_CRITIC_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(
            self.actor.parameters(),
            lr=config.get(MODEL_ACTOR_LR),
            weight_decay=config.get(MODEL_ACTOR_WEIGHT_DECAY))

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(
            self.critic.parameters(),
            lr=config.get(MODEL_CRITIC_LR),
            weight_decay=config.get(MODEL_CRITIC_WEIGHT_DECAY))

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = Memory()

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.nb_actions,
            theta=config.get(RANDOM_THETA),
            mu=config.get(RANDOM_MU),
            sigma=config.get(RANDOM_SIGMA))
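        # Ornstein-Uhlenbeck noise gives temporally correlated exploration for continuous actions.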

        # Hyper-parameters
        self.batch_size = config.get(MODEL_BATCH_SIZE)
        self.tau = config.get(MODEL_TARGET_TAU)
        self.discount = config.get(MODEL_DISCOUNT)
        self.depsilon = 1.0 / config.get(MODEL_EPSILON)

        self.model_path = config.get(MODEL_SAVE_PATH)

        #
        self.epsilon = 1.0

        # init device
        self.device_init()

    def update_policy(self, memory):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
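        # torch.no_grad() (PyTorch >= 0.4) replaces the old volatile flag: targets need no gradients.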
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch))
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(float)) * next_q_values
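        # Note: terminal_batch is assumed to already be a 0/1 "episode continues" mask
        # (keras-rl style), so finished transitions contribute no bootstrap term.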

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = F.mse_loss(q_batch, target_q_batch)

        value_loss.backward()
        self.critic_optim.step()
        self.critic_loss.append(value_loss.item())

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        self.policy_loss.append(policy_loss.item())

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def get_loss(self):
        return self.policy_loss, self.critic_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def device_init(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, s_t):
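        # Deterministic actor output plus OU exploration noise scaled by the decaying epsilon.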
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        action += max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        return action

    def clean(self, decay_epsilon):
        if decay_epsilon:
            self.epsilon -= self.depsilon

    def reset(self):
        self.random_process.reset_states()

    def load_weights(self):
        if not os.path.exists(self.model_path):
            return

        actor_path = os.path.join(self.model_path, 'actor.pkl')
        if os.path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        critic_path = os.path.join(self.model_path, 'critic.pkl')
        if os.path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

    def save_model(self):
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        actor_path = os.path.join(self.model_path, 'actor.pkl')
        torch.save(self.actor.state_dict(), actor_path)

        critic_path = os.path.join(self.model_path, 'critic.pkl')
        torch.save(self.critic.state_dict(), critic_path)

    def get_model(self):
        return self.actor.state_dict(), self.critic.state_dict()

    def load_state_dict(self, actor_state, critic_state):
        self.actor.load_state_dict(actor_state)
        self.critic.load_state_dict(critic_state)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)