Code Example #1
File: agents.py Project: bic4907/MADDPG_simpletag
class Predator:
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0
        self.replay_buffer = list()  # filled by a memory() helper, as in the Preyer class below

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def random_action(self):
        action = np.random.uniform(low=-1.,
                                   high=1.,
                                   size=(self.num_agent, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()
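All of these examples copy and track weights through hard_update and soft_update helpers that none of them defines. A minimal sketch consistent with how they are called (note that the bic4907 classes pass (source, target) while the other projects pass (target, source); at construction time either direction leaves both networks identical):

def hard_update(target, source):
    # copy the source weights into the target so both networks start out identical
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)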
Code Example #2
File: train.py Project: renqibing/RL_DDPG
class DDPG_trainer(object):
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA: self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        self.s_t = observation  # track the current state so observe() can store transitions
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):

        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Help Warm Up
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
                         DISCOUNT * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
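Every agent here draws exploration noise from an OrnsteinUhlenbeckProcess that is never shown. A hedged sketch matching the constructor arguments used in these examples (theta, mu, sigma, size, and the optional dt, sigma_min, n_steps_annealing seen in examples #3 and #8), modeled on the keras-rl implementation:

import numpy as np

class OrnsteinUhlenbeckProcess:
    # temporally correlated noise: x += theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)
    def __init__(self, size, theta, mu=0., sigma=1., dt=1e-2,
                 sigma_min=None, n_steps_annealing=1000):
        self.theta, self.mu, self.sigma = theta, mu, sigma
        self.dt, self.size = dt, size
        if sigma_min is not None:
            # anneal sigma linearly down to sigma_min over n_steps_annealing samples
            self.m = -(sigma - sigma_min) / n_steps_annealing
            self.sigma_min = sigma_min
        else:
            self.m, self.sigma_min = 0., sigma
        self.n_steps = 0
        self.reset_states()

    @property
    def current_sigma(self):
        return max(self.sigma_min, self.m * self.n_steps + self.sigma)

    def sample(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.current_sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.size))
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        self.x_prev = np.zeros(self.size)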
Code Example #3
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames)
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2, size=env.noutput,
                                              sigma_min=0.05, n_steps_annealing=1e6)
    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32-2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0
        
        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \
                     'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps,
                   total_reward, total_reward_original, best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)

        with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
            f.write(report_str + '\n')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        if total_episodes % 100 == 0:
            env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames)
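run_agent is written as a rollout worker for a parallel DDPG setup: it pushes episodes into data_queue, pulls fresh weights from weights_queue, and reads the shared counters global_step, updates, and best_reward. A hedged sketch of how such workers could be launched (model_params, initial_weights, and state_transform are placeholders; the trainer process consuming data_queue is not shown):

from multiprocessing import Process, Queue, Value

data_queue, weights_queue = Queue(), Queue()
global_step = Value('i', 0)   # shared step counter incremented by the workers
updates = Value('i', 0)       # incremented by the (not shown) trainer
best_reward = Value('f', -1e9)

workers = []
for proc_id in range(4):
    p = Process(target=run_agent,
                args=(model_params, initial_weights, state_transform,
                      data_queue, weights_queue, proc_id,
                      global_step, updates, best_reward,
                      0.3,          # param_noise_prob
                      'save_dir'))
    p.start()
    workers.append(p)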
Code Example #4
class BiCNet():
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            # per-agent noise injection is commented out in this example; epsilon still decays
            # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)
        action = np.clip(action, -1., 1.)  # keep the result; np.clip does not work in place

        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):

        state_batches, action_batches, reward_batches, next_state_batches, done_batches = \
            self.get_batches()

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(
            self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        done_batches = torch.Tensor(
            (done_batches == False) * 1).reshape(self.config.batch_size,
                                                 self.n_agents,
                                                 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches,
                                                   target_next_actions)
        main_q = self.critic(state_batches, action_batches)
        '''
        How to concat each agent's Q value?
        '''
        #target_next_q = target_next_q
        #main_q = main_q.mean(dim=1)
        '''
        Reward Norm
        '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches,
                                          clear_action_batches).mean()
        loss_actor += (clear_action_batches**2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
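BiCNet, like the Predator and Preyer classes from the same family of examples, keeps its replay buffer as a plain Python list and evicts with pop(0), which is O(n) per eviction once the buffer is full. A collections.deque with maxlen gives the same FIFO behavior in O(1); a drop-in sketch:

from collections import deque
import random

replay_buffer = deque(maxlen=10000)  # maxlen plays the role of config.memory_length

def memory(s, a, r, s_, done):
    # the oldest transition is evicted automatically once maxlen is reached
    replay_buffer.append((s, a, r, s_, done))

def sample(batch_size):
    return random.sample(replay_buffer, batch_size)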
Code Example #5
File: agents.py Project: bic4907/MADDPG_simpletag
class Preyer:
    def __init__(self, s_dim, a_dim, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def choose_action(self, s, noisy=True):
        if self.config.use_cuda:
            s = Variable(torch.cuda.FloatTensor(s))
        else:
            s = Variable(torch.FloatTensor(s))
        a = self.actor.forward(s).cpu().detach().numpy()

        if noisy:
            a += max(self.epsilon, 0.001) * self.random_process.sample()
            self.epsilon -= self.depsilon
        a = np.clip(a, -1., 1.)

        return np.array([a])

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = \
            self.get_batches()

        state_batches = Variable(torch.Tensor(state_batches).to(self.device))
        action_batches = Variable(
            torch.Tensor(action_batches).reshape(-1, 1).to(self.device))
        reward_batches = Variable(
            torch.Tensor(reward_batches).reshape(-1, 1).to(self.device))
        next_state_batches = Variable(
            torch.Tensor(next_state_batches).to(self.device))
        done_batches = Variable(
            torch.Tensor(
                (done_batches == False) * 1).reshape(-1, 1).to(self.device))

        target_next_actions = self.actor_target.forward(
            next_state_batches).detach()
        target_next_q = self.critic_target.forward(
            next_state_batches, target_next_actions).detach()

        main_q = self.critic(state_batches, action_batches)

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor Loss
        self.actor.zero_grad()
        clear_action_batches = self.actor.forward(state_batches)
        loss_actor = (
            -self.critic.forward(state_batches, clear_action_batches)).mean()
        loss_actor.backward()
        self.actor_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.actor, self.actor_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def getLoss(self):
        return self.c_loss, self.a_loss
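Preyer still routes tensors through torch.autograd.Variable, which has been a no-op wrapper since PyTorch 0.4: tensors carry requires_grad themselves, and inference-time conversion is better expressed with torch.no_grad(). A minimal modernized equivalent of choose_action's tensor handling, under that assumption:

import numpy as np
import torch

def to_policy_input(s, device):
    # replaces Variable(torch.FloatTensor(s)) / Variable(torch.cuda.FloatTensor(s))
    return torch.as_tensor(np.asarray(s), dtype=torch.float32, device=device)

# inside choose_action:
#     with torch.no_grad():
#         a = self.actor(to_policy_input(s, self.device)).cpu().numpy()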
Code Example #6
class DDPG(object):
    def __init__(self, args, nb_states, nb_actions):
        USE_CUDA = torch.cuda.is_available()
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1]
        self.gpu_used = True if self.gpu_ids[0] >= 0 else False

        net_cfg = {
            'hidden1':args.hidden1,
            'hidden2':args.hidden2,
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.is_training = True

        self.continious_action_space = False

    def update_policy(self):
        pass

    def cuda_convert(self):
        if len(self.gpu_ids) == 1:
            if self.gpu_ids[0] >= 0:
                with torch.cuda.device(self.gpu_ids[0]):
                    print('model cuda converted')
                    self.cuda()
        if len(self.gpu_ids) > 1:
            self.data_parallel()
            self.cuda()
            self.to_device()
            print('model cuda converted and paralleled')

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def data_parallel(self):
        self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids)
        self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids)
        self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids)
        self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids)

    def to_device(self):
        self.actor.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.actor_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.critic.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.critic_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        # self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        # proto action
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])),
            gpu_used=self.gpu_used
        ).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon
        
        # self.a_t = action
        return action

    def reset(self, s_t):
        self.s_t = s_t
        self.random_process.reset_states()

    def load_weights(self, dir):
        if dir is None: return

        if self.gpu_used:
            # load all tensors to GPU (first gpu id)
            ml = lambda storage, loc: storage.cuda(self.gpu_ids[0])
        else:
            # load all tensors to CPU
            ml = lambda storage, loc: storage

        self.actor.load_state_dict(
            torch.load('output/{}/actor.pkl'.format(dir), map_location=ml)
        )

        self.critic.load_state_dict(
            torch.load('output/{}/critic.pkl'.format(dir), map_location=ml)
        )
        print('model weights loaded')


    def save_model(self,output):
        if len(self.gpu_ids) == 1 and self.gpu_ids[0] >= 0:
            with torch.cuda.device(self.gpu_ids[0]):
                torch.save(
                    self.actor.state_dict(),
                    '{}/actor.pt'.format(output)
                )
                torch.save(
                    self.critic.state_dict(),
                    '{}/critic.pt'.format(output)
                )
        elif len(self.gpu_ids) > 1:
            torch.save(self.actor.module.state_dict(),
                       '{}/actor.pt'.format(output)
            )
            torch.save(self.critic.module.state_dict(),
                       '{}/critic.pt'.format(output)
                       )
        else:
            torch.save(
                self.actor.state_dict(),
                '{}/actor.pt'.format(output)
            )
            torch.save(
                self.critic.state_dict(),
                '{}/critic.pt'.format(output)
            )

    def seed(self, seed):
        torch.manual_seed(seed)
        # seed() runs before self.gpu_ids is assigned in __init__, so query CUDA directly
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
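save_model above has to special-case nn.DataParallel because the wrapper stores the real network under .module. A common way to collapse the three-way branch is to unwrap before saving; a sketch (not the project's own code):

import torch

def unwrap(model):
    # nn.DataParallel exposes the wrapped network as .module; plain modules pass through
    return model.module if hasattr(model, 'module') else model

def save_model(agent, output):
    torch.save(unwrap(agent.actor).state_dict(), '{}/actor.pt'.format(output))
    torch.save(unwrap(agent.critic).state_dict(), '{}/critic.pt'.format(output))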
Code Example #7
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch (no_grad replaces the removed volatile flag)
        with torch.no_grad():
            next_q_values = self.critic_target(
                [
                    to_tensor(next_state_batch),
                    self.actor_target(to_tensor(next_state_batch)),
                ]
            )

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
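This class shows the complete DDPG agent API: reset at episode start, select_action and observe every step, update_policy once the buffer has data. A hedged driver loop, assuming a Gym-style env with actions in [-1, 1]; WARMUP and num_episodes are illustrative constants:

WARMUP = 1000         # steps of pure random actions before learning starts
num_episodes = 1000

step = 0
for episode in range(num_episodes):
    obs = env.reset()
    agent.reset(obs)
    done = False
    while not done:
        action = agent.random_action() if step < WARMUP else agent.select_action(obs)
        obs, reward, done, _ = env.step(action)
        agent.observe(reward, obs, done)   # stores (s_t, a_t, r_t, done) and advances s_t
        if step >= WARMUP:
            agent.update_policy()
        step += 1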
Code Example #8
File: agent.py Project: 0123Andrew/submit_l2r
def run_agent(args,
              model_params,
              weights,
              data_queue,
              weights_queue,
              process,
              global_step,
              updates,
              best_reward,
              param_noise_prob,
              save_dir,
              max_steps=10000000):

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim,
                  prosthetic=args.prosthetic,
                  difficulty=args.difficulty,
                  skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw
    # random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.3, size=env.noutput, sigma_min=0.05, n_steps_annealing=1e6)

    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))

    random_process = OrnsteinUhlenbeckProcess(theta=.1,
                                              mu=0.,
                                              sigma=sigma_rand,
                                              dt=dt_rand,
                                              size=env.noutput,
                                              sigma_min=0.05,
                                              n_steps_annealing=1e6)

    print('OUProcess_sigma = ' + str(sigma_rand) + '    OUProcess_dt = ' +
          str(dt_rand) + '    param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, reward: {:.2f}, original_reward {:.4f}, best reward: {:.2f}, noise: {}'. \
        #     format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, info['pelvis_X'], total_reward, total_reward_original, best_reward.value, 'actions' if action_noise else 'params')
        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
        #     format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, info['pelvis_X'], total_reward, best_reward.value, 'actions' if action_noise else 'params')
        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, info['pelvis'][0], info['pelvis'][2], total_reward, best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)

        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except OSError:
            print('failed to append to train_report.log in ' + save_dir)

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim,
                          prosthetic=args.prosthetic,
                          difficulty=args.difficulty,
                          skip_frame=config.skip_frames)
Code Example #9
class Agent(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions, args.init_w)
        self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w)

        self.critic = Critic(self.nb_states, self.nb_actions, args.init_w)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    args.init_w)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Exploration noise process
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.trajectory_length = args.trajectory_length
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.is_training = True

        #
        if USE_CUDA: self.cuda()

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        action, _ = self.actor(to_tensor(np.array([state])))
        action = to_numpy(action).squeeze(0)
        if noise_enable:
            action += self.is_training * max(self.epsilon,
                                             0) * self.random_process.sample()

        action = np.clip(action, -1., 1.)
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset_lstm_hidden_state(self, done=True):
        self.actor.reset_lstm_hidden_state(done)

    def reset(self):
        self.random_process.reset_states()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def load_weights(self, output):
        if output is None: return False

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

        return True

    def save_model(self, output):
        if not os.path.exists(output):
            os.mkdir(output)

        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
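Unlike the other agents here, this one has an LSTM actor, so the recurrent hidden state must be cleared between episodes along with the noise process. A sketch of the per-episode bookkeeping the API implies (env and agent assumed constructed):

state = env.reset()
agent.reset()                              # clear the OU noise process
agent.reset_lstm_hidden_state(done=True)   # start the actor's recurrence fresh

done = False
while not done:
    # the hidden state persists across select_action calls within the episode,
    # so the policy conditions on the trajectory seen so far
    action = agent.select_action(state)
    state, reward, done, _ = env.step(action)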
Code Example #10
File: ddpg.py Project: mabingqi1/DRL
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.epistemic_actor = args.epistemic_actor  # true / false
        self.epistemic_critic = args.epistemic_critic  # true / false

        self.aleatoric_actor = args.aleatoric_actor  # true / false
        self.aleatoric_critic = args.aleatoric_critic  # true / false

        self.dropout_n_actor = args.dropout_n_actor
        self.dropout_n_critic = args.dropout_n_critic

        self.dropout_p_actor = args.dropout_p_actor
        self.dropout_p_critic = args.dropout_p_critic
        # assumed flag: select_action_with_dropout() reads it, but the original
        # snippet never defines it
        self.train_with_dropout = getattr(args, 'train_with_dropout', False)

        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0

        # self.save_file = open(self.save_dir + '/std.txt', "a")

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_actor,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        net_cfg_critic = {
            'dropout_n': args.dropout_n_critic,
            'dropout_p': args.dropout_p_critic,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = UAActor(self.nb_states, self.nb_actions,
                                    **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions,
                               **net_cfg_critic)
        self.critic_target = UACritic(self.nb_states, self.nb_actions,
                                      **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q batch
        # TODO : Also apply epistemic and aleatoric uncertainty to both actor and critic target network
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])
        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float32)) * next_q_values

        #########################
        #  Critic update
        #########################
        self.critic.zero_grad()

        # TODO : Add epistemic uncertainty for critic network
        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        # TODO : Add aleatoric uncertainty term from aleatoric uncertainty output of critic network (Add uncertainty term in criterion)
        value_loss = criterion(q_batch, target_q_batch)

        value_loss.backward()
        self.critic_optim.step()

        #########################
        #  Actor update
        #########################
        self.actor.zero_grad()

        # policy loss
        # TODO : Add epistemic certainty term from aleatoric certainty output of policy network
        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        # policy_loss = policy_loss.mean() + actor_certainty

        policy_loss.backward()
        self.actor_optim.step()

        #########################
        #  Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    # def select_action(self, s_t, decay_epsilon=True):
    #     action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    #     action += self.is_training*max(self.epsilon, 0)*self.random_process.sample()
    #
    #     if decay_epsilon:
    #         self.epsilon -= self.depsilon
    #
    #     self.a_t = action
    #     return action

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        dropout_actions = np.array([])

        with torch.no_grad():
            for _ in range(self.dropout_n_actor):
                action = to_numpy(
                    self.actor.forward_with_dropout(to_tensor(np.array(
                        [s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(to_tensor(np.array(
                    [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()

        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array(
                [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()
        """
        UNFIXED RESET POINT for Mujoco
        """
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            # self.action_std = np.append(self.action_std, [np.std(dropout_actions)])

            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(np.std(dropout_actions)) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(np.mean(dropout_actions)) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", np.std(dropout_actions),
                  "            ", "dir : ", str(self.save_dir))
        """
        FIXED RESET POINT for MCC
        """
        # if s_t[0] == -0.5 and s_t[1] == 0:
        #     # print("fixed dropout actions std", np.std(dropout_actions), "            ", "dir : ", str(self.save_dir))
        #     self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        #     # np.savetxt(self.save_dir + '/std.txt', self.action_std, fmt='%4.10f', delimiter=' ')
        #     with open(self.save_dir + "/std.txt", "a") as myfile:
        #         myfile.write(str(np.std(dropout_actions))+'\n')
        #     with open(self.save_dir + "/mean.txt", "a") as myfile:
        #         myfile.write(str(np.mean(dropout_actions))+'\n')

        if not (os.path.isdir(self.save_dir + "/episode/" +
                              str(self.episode))):
            os.makedirs(
                os.path.join(self.save_dir + "/episode/" + str(self.episode)))

        self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        with open(self.save_dir + "/episode/" + str(self.episode) + "/std.txt",
                  "a") as myfile:
            myfile.write(str(np.std(dropout_actions)) + '\n')

        with open(
                self.save_dir + "/episode/" + str(self.episode) + "/mean.txt",
                "a") as myfile:
            myfile.write(str(np.mean(dropout_actions)) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = plt_action

        return plt_action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
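select_action_with_dropout is Monte Carlo dropout: dropout is kept active at inference, the actor is run dropout_n_actor times, and the spread of the sampled actions is logged as an epistemic-uncertainty estimate. The core idea in isolation, as a hedged sketch:

import torch

def mc_dropout_actions(actor, state, n_samples):
    # each stochastic forward pass (dropout active) is one sample from the
    # approximate posterior over actions; their std estimates epistemic uncertainty
    with torch.no_grad():
        samples = torch.stack(
            [actor.forward_with_dropout(state) for _ in range(n_samples)])
    return samples.mean(dim=0), samples.std(dim=0)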
Code Example #11
class DDPG(object):
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA: self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch (no_grad replaces the removed volatile flag)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])[:, 0]

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float32))*next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()

        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.0)
        # the optimizer applies the update; a manual SGD step here would apply it twice
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.0)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()

        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
Code Example #12
File: ddpg.py Project: sergeevii123/async_ddpg
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }

        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10

    def update_policy(self, shared_model, args):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size, shared=args.use_more_states, num_states=args.num_states)

        # Prepare for the target q batch (no_grad replaces the removed volatile flag)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float32))*next_q_values

        # Critic update
        self.critic_optim.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        if args.shared:
            ensure_shared_grads(self.critic, shared_model.critic)

        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.shared:
            ensure_shared_grads(self.actor, shared_model.actor)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def share_memory(self):
        self.critic.share_memory()
        self.actor.share_memory()

    def add_optim(self, actor_optim, critic_optim):
        self.actor_optim = actor_optim
        self.critic_optim = critic_optim

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_models(self, agent):
        self.actor = deepcopy(agent.actor)
        self.actor_target = deepcopy(agent.actor_target)
        self.critic = deepcopy(agent.critic)
        self.critic_target = deepcopy(agent.critic_target)
        self.actor_optim = deepcopy(agent.actor_optim)
        self.critic_optim = deepcopy(agent.critic_optim)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def train(self):
        self.critic.train()
        self.actor.train()

    def state_dict(self):
        return [
            self.actor.state_dict(),
            self.actor_target.state_dict(),
            self.critic.state_dict(),
            self.critic_target.state_dict()
        ]

    def load_state_dict(self, list_of_dicts):
        self.actor.load_state_dict(list_of_dicts[0])
        self.actor_target.load_state_dict(list_of_dicts[1])
        self.critic.load_state_dict(list_of_dicts[2])
        self.critic_target.load_state_dict(list_of_dicts[3])

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
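
The async variant above also relies on an `ensure_shared_grads` helper to push the worker's gradients into the shared model before the optimizer steps; its definition is not included in the snippet, so the following is a sketch of the common async actor-critic pattern, an assumption rather than the project's actual code:

def ensure_shared_grads(model, shared_model):
    # Point the shared model's gradient buffers at the local worker's gradients,
    # so that stepping the optimizer on the shared parameters uses them.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return  # gradients already linked for this process
        shared_param._grad = param.grad
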
Code example #13
File: ddpg.py  Project: fducau/pytorch-ddpg
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None     # Most recent state
        self.a_t = None     # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''
        Produce a random action
        '''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick action according to actor network.
        :param s_t: current state s_t
        :param decay_epsilon: bool.
        :param clip: tuple to clip action values between
                     clip[0] and clip[1]. Default (-1, 1)
                     Set to false if not clip.
        '''
        # Set default for clip if None
        if clip is None:
            clip = (-1., 1.)

        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)

        # Add noise to the action.
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        if clip is not False:
            if len(clip) != 2:
                raise ValueError('Clip parameter malformed, received {}, '
                                 'expected a size 2 tuple'.format(clip))
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

    def save_model(self, output):
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
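
As a usage note for the class above, its public methods are typically driven by a loop like the following. This is a hypothetical sketch: `env` is assumed to be a Gym-style environment, and `num_episodes` / `warmup` are illustrative names, not parameters of the project.

agent = DDPG(nb_states, nb_actions, args)
total_steps = 0
for episode in range(num_episodes):
    obs = env.reset()
    agent.reset(obs)
    done = False
    while not done:
        # Fill the replay buffer with random actions first, then follow the policy.
        if total_steps < warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(obs)
        obs, reward, done, _ = env.step(action)
        agent.observe(reward, obs, done)  # stores (s_t, a_t, r_t, done) and advances s_t
        if total_steps >= warmup:
            agent.update_policy()
        total_steps += 1
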
Code example #14
class DDPG(object):
    def __init__(self,
                 env,
                 mem_size=7 * int(1e3),
                 lr_critic=1e-3,
                 lr_actor=1e-4,
                 epsilon=1.,
                 max_epi=1500,
                 epsilon_decay=1. / (1e5),
                 gamma=.99,
                 target_update_frequency=200,
                 batch_size=64,
                 random_process=True,
                 max_step=None):
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  #for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        # Use a positive learning rate; gradient ascent on Q is obtained below by
        # negating the policy objective in update_actor_critic().
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(\
                size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process

    def train(self, dir=None, interval=1000):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/train_record'.format(dir),
                                        force=True)
            os.mkdir(os.path.join(dir, 'models'))
        update_counter = 0
        epsilon = self.EPSILON
        for epi in trange(self.MAX_EPI, desc='train epi', leave=True):
            self.random_process.reset_states()
            o = self.env.reset()

            counter = 0
            acc_r = 0
            while True:
                counter += 1

                #if dir is not None:
                #    self.env.render()

                a = self.choose_action(o)

                if self.RAND_PROC:
                    a += max(epsilon, 0) * self.random_process.sample()
                    a = np.clip(a, -1., 1.)
                    epsilon -= self.EPSILON_DECAY

                o_, r, done, info = self.env.step(self.map_to_action(a))
                self.exp.push(o, a, r, o_, done)

                if epi > 0:
                    self.update_actor_critic()
                    update_counter += 1
                    if update_counter % self.TARGET_UPDATE_FREQUENCY == 0:
                        self.update_target()

                acc_r += r
                o = o_
                if done:
                    break
            if dir is not None:
                if (epi + 1) % interval == 0:
                    self.save(os.path.join(dir, 'models'),
                              str(epi + 1),
                              save_data=False)
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            self.data = self.data.append(s, ignore_index=True)

    def choose_action(self, state):
        self.actor.eval()
        s = Variable(torch.Tensor(state)).unsqueeze(0)
        if self.CUDA:
            s = s.cuda()
        a = self.actor(s).data.cpu().numpy()[0].astype('float64')
        self.actor.train()
        return a

    def map_to_action(self, a):
        return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2

    def update_target(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update_actor_critic(self):
        # sample minibatch
        minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE)))
        bat_o = Variable(torch.Tensor(minibatch.state))
        bat_a = Variable(torch.Tensor(minibatch.action))
        bat_r = Variable(torch.Tensor(minibatch.reward)).unsqueeze(1)
        bat_o_ = Variable(torch.Tensor(minibatch.next_state))
        bat_not_done_mask = list(
            map(lambda done: 0 if done else 1, minibatch.done))
        bat_not_done_mask = Variable(
            torch.ByteTensor(bat_not_done_mask)).unsqueeze(1)
        if self.CUDA:
            bat_o = bat_o.cuda()
            bat_a = bat_a.cuda()
            bat_r = bat_r.cuda()
            bat_o_ = bat_o_.cuda()
            bat_not_done_mask = bat_not_done_mask.cuda()

        # update critic
        bat_a_o_ = self.target_actor(bat_o_)

        Gt = bat_r
        Gt[bat_not_done_mask] += self.GAMMA * self.target_critic(
            bat_o_, bat_a_o_)[bat_not_done_mask]
        Gt.detach_()
        eval_o = self.critic(bat_o, bat_a)
        criterion = nn.MSELoss()
        if self.CUDA:
            criterion.cuda()
        loss = criterion(eval_o, Gt)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

        # update actor
        self.critic.eval()
        bat_a_o = self.actor(bat_o)
        # Maximise Q(s, actor(s)) by minimising its negative.
        obj = -torch.mean(self.critic(bat_o, bat_a_o))
        self.optim_actor.zero_grad()
        obj.backward()
        self.optim_actor.step()
        self.critic.train()

    def test(self, dir=None, n=1):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/test_record'.format(dir),
                                        force=True,
                                        video_callable=lambda episode_id: True)

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        df = pd.DataFrame(title)

        for epi in trange(n, desc='test epi', leave=True):
            o = self.env.reset()
            acc_r = 0
            while True:
                #if dir is not None:
                #    self.env.render()
                a = self.choose_action(o)
                o_, r, done, info = self.env.step(self.map_to_action(a))
                acc_r += r
                o = o_
                if done:
                    break
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            df = df.append(s, ignore_index=True)
        if dir is not None:
            df.to_csv('{}/test_data.csv'.format(dir))
        else:
            print(df)

    def save(self, dir, suffix='', save_data=True):
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pt'.format(dir, suffix))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pt'.format(dir, suffix))
        if save_data:
            self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix))

    def load_actor(self, dir):
        self.actor.load_state_dict(torch.load(dir))

    def load_critic(self, dir):
        self.critic.load_state_dict(torch.load(dir))

    def get_data(self):
        return self.data
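
The class above depends on `Experience` and `common.Transition`, which live in modules not shown here (the same `common` module also provides the S_EPI / S_TOTAL_R constants). Given how they are called (`exp.push(o, a, r, o_, done)` and `Transition(*zip(*exp.sample(batch_size)))`), a plausible reconstruction of the replay buffer looks like the sketch below; the field names are inferred from the update code and should be treated as assumptions.

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class Experience:
    def __init__(self, mem_size):
        # Fixed-capacity buffer: old transitions are dropped when full.
        self.buffer = deque(maxlen=mem_size)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append(Transition(state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)
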
Code example #15
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, now_date, now_time, args):
        print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")
        if args.seed > 0:
            self.seed(args.seed)

        self.total_training_step = 1
        self.episode = 0
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        # self.criterion = nn.MSELoss()
        self.critic_case = 'stochastic'
        self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, True,
                                    **net_cfg)

        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, False,
                               **net_cfg)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, True,
                                      **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.s_t_noise = None  # Most recent state
        self.a_t_mean = None  # Most recent action
        self.a_t_var = None
        self.is_training = True

        if torch.cuda.is_available():
            self.cuda()

        self.now_date = now_date
        self.now_time = now_time

        if os.path.exists('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                          self.now_time + '/') is False:
            os.mkdir('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                     self.now_time + '/')

    def update_policy(self):
        # print("Policy update starts...")
        # Sample batch
        state_batch, state_noise_batch, action_mean_batch, action_var_batch, reward_batch, next_state_batch, next_state_noise_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q

        with torch.no_grad():
            action_mean, action_var = self.actor_target(
                to_tensor(next_state_batch, volatile=True),
                to_tensor(next_state_noise_batch, volatile=True))
            next_q_values = self.critic_target(
                to_tensor(next_state_batch, volatile=True),
                to_tensor(next_state_noise_batch, volatile=True), action_mean,
                action_var)

        target_q_batch_mean = to_tensor(
            reward_batch) + self.discount * to_tensor(
                terminal_batch.astype(np.float)) * next_q_values[0]
        target_q_batch_var = to_tensor(
            reward_batch) + self.discount * to_tensor(
                terminal_batch.astype(np.float)) * next_q_values[1]

        # Critic update
        self.critic.zero_grad()

        # case 1 : Stochastic error (KL Divergence on both distribution)
        if self.critic_case == 'stochastic':

            q_batch = self.critic(to_tensor(state_batch),
                                  to_tensor(state_noise_batch),
                                  to_tensor(action_mean_batch),
                                  to_tensor(action_var_batch))
            value_loss = KLDLoss(q_batch[0], q_batch[1], target_q_batch_mean,
                                 target_q_batch_var)

        # case 2 : Deterministic error (MSE error)
        else:
            q_batch = self.critic(to_tensor(state_batch),
                                  to_tensor(state_noise_batch),
                                  to_tensor(action_mean_batch),
                                  to_tensor(action_var_batch))
            # Compare point estimates (mean minus variance) of the predicted and
            # target Q distributions with a mean-squared error.
            q_batch_sample = q_batch[0] - q_batch[1]
            target_q_batch_sample = target_q_batch_mean - target_q_batch_var
            value_loss = nn.MSELoss()(q_batch_sample, target_q_batch_sample)

        value_loss.backward()

        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        action_mean, action_var = self.actor(to_tensor(state_batch),
                                             to_tensor(state_noise_batch))

        policy_loss_mean, policy_loss_var = self.critic(
            to_tensor(state_batch), to_tensor(state_noise_batch), action_mean,
            action_var)
        # policy_loss_mean = -policy_loss_mean

        if self.critic_case == 'stochastic':
            # policy_loss = policy_loss_mean.mean() + policy_loss_var.mean()
            policy_loss = policy_loss_mean.mean()
        else:
            policy_loss = (policy_loss_mean - policy_loss_var).mean()

        policy_loss.requires_grad = True
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        # print("Policy update ends...")

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_next_mean, s_next_var, done):
        if self.is_training:
            self.memory.append(self.s_t, self.s_t_noise, self.a_t_mean,
                               self.a_t_var, r_t, done)
            self.s_t = s_next_mean
            self.s_t_noise = s_next_var

    def random_action(self):
        action_mean = np.random.uniform(-1., 1., self.nb_actions)
        action_var = np.random.uniform(-2., 2., self.nb_actions)
        self.a_t_mean = action_mean
        self.a_t_var = action_var
        return action_mean

    def select_action(self, s_t, s_t_noise, decay_epsilon=True):
        action_mean, action_var = self.actor(to_tensor(np.array([s_t])),
                                             to_tensor(np.array([s_t_noise])))

        action_noise = []

        # amplification = 10000 - self.total_training_step / 100
        # if amplification < 1:
        #     amplification = 1
        amplification = 1

        for index in range(action_mean.shape[0]):
            action_noise.append(
                np.random.normal(0,
                                 action_var.cpu()[index] * amplification, 1))

        # action_mean += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        action_sample = action_mean + max(self.epsilon, 0) * torch.tensor(
            np.array(action_noise).squeeze()).cuda()

        # print("action_mean", action_mean)
        # print("action_noise", action_noise)
        # print("action_sample", action_sample)

        # action_sample = np.clip(action_sample, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t_mean = action_mean.cpu().numpy()
        self.a_t_var = action_var.cpu().numpy()
        self.total_training_step = self.total_training_step + 1

        action_mean_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_mean.txt', 'a')
        action_var_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_var.txt', 'a')
        action_noise_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_noise.txt', 'a')
        action_sample_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_sample.txt', 'a')

        action_mean_file.write(str(action_mean) + '\n')
        action_var_file.write(str(action_var) + '\n')
        action_noise_file.write(str(action_noise) + '\n')
        action_sample_file.write(str(action_sample) + '\n')

        action_mean_file.close()
        action_var_file.close()
        action_noise_file.close()
        action_sample_file.close()

        return action_sample.cpu().numpy()
        # return np.clip(action_sample.cpu().numpy(), -1.0, 1.0)

    def reset(self, obs, obs_noise):
        self.s_t = obs
        self.s_t_noise = obs_noise
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
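
The stochastic branch of `update_policy` above calls a `KLDLoss` that is not defined in the snippet. Assuming it measures the KL divergence between two diagonal Gaussians given as (mean, variance) pairs, a sketch could look like this; the epsilon guard is an added assumption:

import torch


def KLDLoss(mean_q, var_q, mean_p, var_p, eps=1e-6):
    # KL( N(mean_q, var_q) || N(mean_p, var_p) ) for diagonal Gaussians,
    # averaged over the batch; eps guards against zero variances.
    var_q = var_q.clamp(min=eps)
    var_p = var_p.clamp(min=eps)
    kl = 0.5 * (torch.log(var_p / var_q)
                + (var_q + (mean_q - mean_p) ** 2) / var_p
                - 1.0)
    return kl.mean()
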
Code example #16
class DDPG(object):
    def __init__(self):

        # random seed for torch
        __seed = config.get(MODEL_SEED)
        self.policy_loss = []
        self.critic_loss = []
        if __seed > 0:
            self.seed(__seed)

        self.nb_states = config.get(MODEL_STATE_COUNT)
        self.nb_actions = config.get(MODEL_ACTION_COUNT)

        # Create Actor and Critic Network
        actor_net_cfg = {
            'hidden1': config.get(MODEL_ACTOR_HIDDEN1),
            'hidden2': config.get(MODEL_ACTOR_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }
        critic_net_cfg = {
            'hidden1': config.get(MODEL_CRITIC_HIDDEN1),
            'hidden2': config.get(MODEL_CRITIC_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(
            self.actor.parameters(),
            lr=config.get(MODEL_ACTOR_LR),
            weight_decay=config.get(MODEL_ACTOR_WEIGHT_DECAY))

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(
            self.critic.parameters(),
            lr=config.get(MODEL_CRITIC_LR),
            weight_decay=config.get(MODEL_CRITIC_WEIGHT_DECAY))

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = Memory()

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.nb_actions,
            theta=config.get(RANDOM_THETA),
            mu=config.get(RANDOM_MU),
            sigma=config.get(RANDOM_SIGMA))

        # Hyper-parameters
        self.batch_size = config.get(MODEL_BATCH_SIZE)
        self.tau = config.get(MODEL_TARGET_TAU)
        self.discount = config.get(MODEL_DISCOUNT)
        self.depsilon = 1.0 / config.get(MODEL_EPSILON)

        self.model_path = config.get(MODEL_SAVE_PATH)

        #
        self.epsilon = 1.0

        # init device
        self.device_init()

    def update_policy(self, memory):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch))
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float))*next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = F.mse_loss(q_batch, target_q_batch)

        value_loss.backward()
        self.critic_optim.step()
        self.critic_loss.append(value_loss.item())

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        self.policy_loss.append(policy_loss.item())

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def get_loss(self):
        return self.policy_loss, self.critic_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def device_init(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, s_t):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        action += max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        return action

    def clean(self, decay_epsilon):
        if decay_epsilon:
            self.epsilon -= self.depsilon

    def reset(self):
        self.random_process.reset_states()

    def load_weights(self):
        if not os.path.exists(self.model_path):
            return

        actor_path = os.path.join(self.model_path, 'actor.pkl')
        if os.path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        critic_path = os.path.join(self.model_path, 'critic.pkl')
        if os.path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

    def save_model(self):
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        actor_path = os.path.join(self.model_path, 'actor.pkl')
        torch.save(self.actor.state_dict(), actor_path)

        critic_path = os.path.join(self.model_path, 'critic.pkl')
        torch.save(self.critic.state_dict(), critic_path)

    def get_model(self):
        return self.actor.state_dict(), self.critic.state_dict()

    def load_state_dict(self, actor_state, critic_state):
        self.actor.load_state_dict(actor_state)
        self.critic.load_state_dict(critic_state)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
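
Finally, all of the examples draw exploration noise from an `OrnsteinUhlenbeckProcess` whose implementation is not shown. Below is a minimal self-contained sketch of the usual discrete-time OU process with the constructor arguments (size, theta, mu, sigma) and the reset_states()/sample() interface the examples rely on; treat it as an illustrative assumption, not the code the projects actually use (some of them import a richer version with sigma annealing).

import numpy as np


class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1.0):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset_states()

    def reset_states(self):
        # Restart the process from its mean at the beginning of an episode.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # Euler-Maruyama step:
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = x
        return x
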