Example #1
    def __init__(self, args, nb_states, nb_actions):
        USE_CUDA = torch.cuda.is_available()
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.gpu_ids = [i for i in range(args.gpu_nums)
                        ] if USE_CUDA and args.gpu_nums > 0 else [-1]
        self.gpu_used = True if self.gpu_ids[0] >= 0 else False

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **net_cfg).double()
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=args.p_lr,
                                weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions,
                             **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **net_cfg).double()
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=args.c_lr,
                                 weight_decay=args.weight_decay)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        self.continious_action_space = False
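Example #1 (and most of the PyTorch examples below) calls hard_update and soft_update without showing them; each repo keeps them in a small utility module. A minimal sketch of the usual definitions, offered as an assumption about those helpers rather than code taken from any one example:

def hard_update(target, source):
    # Copy every source parameter into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)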
Example #2
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }

        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10
Example #3
    def __init__(self, env, args):  #(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.env = env

        self.nb_states = self.env.observation_space.shape[0]
        self.nb_actions = self.env.action_space.shape[0]

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        self.load_weights(args.output)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA: self.cuda()
Example #4
    def __init__(self, in_channels, num_actions, config):
        super(DDPG, self).__init__()

        self.nb_states = in_channels
        self.nb_actions = num_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': config['hidden1'],
            'hidden2': config['hidden2'],
            # 'hidden3': config['hidden3'],
            # 'hidden4': config['hidden4'],
            'init_w': config['init_w']
        }

        self.loss = nn.MSELoss()
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=config['plr'])

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=config['lr'])

        if isGPU:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        self.observation = config['observation']
        self.config = config

        if config['use_memory']:
            self.experience_replay = SequentialMemory(limit=config['memory_size'], window_length=1)
        else:
            self.experience_replay = deque(maxlen=config['memory_size'])  # Create Buffer replay

        self.random_process = OUProcess(size=self.nb_actions, theta=config['ou_theta'], mu=config['ou_mu'],
                                        sigma=config['ou_sigma'])

        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.discount = config['discount']
        self.depsilon = 1. / config['epsilon_decay']

        self.epsilon = 1.0
Example #5
    def __init__(self, env, policy, gamma, tau, epsilon, epsilon_decay,
                 actor_lr, critic_lr, theta, sigma, mu, buffer_size):

        #self.num_states = num_states
        #self.num_actions = num_actions
        #self.is_training = False
        self.env = env

        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.theta = theta
        self.sigma = sigma
        self.mu = mu
        self.buffer_size = buffer_size

        self.policy = policy
        self.actor = policy.actor
        self.critic = policy.critic
        self.actor_target = policy.actor_target
        self.critic_target = policy.critic_target
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_lr)
        self.criterion = nn.MSELoss()

        #the actor/actor_target and critic/critic_target need to have the same weights to start with
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.memory = SequentialMemory(limit=self.buffer_size, window_length=1)
        #self.replay = ExpcerienceReplay(BUFFER_SIZE,BATCH_SIZE)

        self.ou_noise = Ornstein_Uhlenbeck(theta=self.theta,
                                           sigma=self.sigma,
                                           mu=self.mu)

        if USE_CUDA: self.cuda()
Example #6
 def initialize_memory(self, stocks):
     self.memory = []
     for i in range(self.n_memory):
         self.memory.append(SequentialMemory(self.memory_length))
     for t in range(len(stocks) - 1):
         for idx_memory in range(self.n_memory):
             action = np.random.normal(0, self.noise_scale, self.n_stock)
             action = self.norm_action(action)
             reward = np.sum((stocks[t + 1] - stocks[t]) * action)
             self.memory[idx_memory].append(stocks[t], action, reward)
Example #7
 def initialize_memory(self, stocks, scale=10):
     self.memory = []
     for i in range(self.n_memory):
         self.memory.append(SequentialMemory(self.memory_length))
     for t in range(len(stocks)):
         for idx_memory in range(self.n_memory):
             action = None
             reward = np.concatenate(
                 (np.reshape(stocks[t],
                             (self.n_stock, 1)), np.zeros(
                                 (self.n_stock, 1))),
                 axis=-1)
             self.memory[idx_memory].append(stocks[t], action, reward)
Example #8
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA: self.cuda()
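The OrnsteinUhlenbeckProcess used for exploration noise in these examples is likewise imported from each repo's utilities (keras-rl ships one as well). A minimal NumPy sketch matching the constructor arguments used above; treat it as an assumption about the shape of that class, not the exact library code:

import numpy as np

class OrnsteinUhlenbeckProcess:
    # Temporally correlated exploration noise; parameters mirror the calls above
    # (size, theta, mu, sigma). dt is an extra assumption for the discretisation step.
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1.0):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x_prev) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x_prev = self.x_prev + dx
        return self.x_prev

    def reset_states(self):
        self.x_prev = np.ones(self.size) * self.mu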
Example #9
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(
            self.actor_target, self.actor
        )  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        # next_q_values.volatile = False

        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
Example #10
            # resetting the environment. We need to pass in `terminal=False` here since
            # the *next* state, that is the state of the newly reset environment, is
            # always non-terminal by convention.
            forward(observation)
            backward(step, 0., terminal=False)
            episode_logs = {
                'episode_reward': episode_reward,
                'nb_episode_steps': episode_step,
                'nb_steps': step,
            }
            callback_list.on_episode_end(episode, episode_logs)
            episode += 1
            observation = None
            episode_step = None
            episode_reward = None


memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
checkpoint_weights_filename = os.path.join(root_dir, 'my_pacman_weights_weights_{step}.h5f')
callbacks = [MyCheckPoint(checkpoint_weights_filename, interval=100000, verbose=1)]
callbacks += [TrainEpisodeLogger()]
callbacks += [TrainIntervalLogger(interval=10000)]
trainable_model, target_model = compile(Adam(lr=.00025), metrics=['mae'])
fit(callbacks=callbacks, total_steps=10000000, verbose=1)
Example #11
                       STATE_SIZE, ACTION_COUNT, AGENT_HISTORY_LENGTH,
                       var_loss_coef1, var_loss_coef2, var_loss_coef3) +
            (float(args.compare_models[4 * I + 3]), ))

print(models_compare)

print(model_env.summary())

if args.graph_file is not None:
    from keras.utils import plot_model
    plot_model(model_env,
               to_file=args.graph_file,
               show_shapes=True,
               show_layer_names=True)

replay_buffer = SequentialMemory(max_size=REPLAY_MEMORY_SIZE)

if args.dqn_weight is None:
    model.load_weights('run2/weights_{0}.h5'.format(args.weight_idx * 125000))
else:
    model.load_weights(args.dqn_weight)
total_reward = 0.0
newGame()
epNo = 0

if args.mode == "train":
    loss_log = open(args.output_dir + '/loss.txt', "a")

if args.mean_image is not None:
    meanImage = np.load(args.mean_image)
else:
Example #12
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None     # Most recent state
        self.a_t = None     # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''
        Produce a random action
        '''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick action according to actor network.
        :param s_t: current state s_t
        :param decay_epsilon: bool.
        :param clip: tuple to clip action values between
                     clip[0] and clip[1]. Default (-1, 1)
                     Set to false if not clip.
        '''
        # Set default for clip if None
        if clip is not False and clip is None:
            clip = (-1., 1.)

        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)

        # Add noise to the action.
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        if clip is not False:
            if len(clip) != 2:
                raise ValueError('Clip parameter malformed, received {}, '
                                 'expected a size 2 tuple'.format(clip))
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

    def save_model(self, output):
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
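For context, an agent like the DDPG class above is normally driven by an environment loop along these lines. The gym environment name, warmup length, and step count are illustrative assumptions rather than part of the original code, and the classic 4-tuple gym step API is assumed:

import gym

env = gym.make('Pendulum-v1')            # illustrative environment
# args is the parsed hyper-parameter namespace used throughout the examples
agent = DDPG(env.observation_space.shape[0], env.action_space.shape[0], args)

observation = env.reset()
agent.reset(observation)
for step in range(100000):
    if step < 1000:                      # warmup: random actions to fill the replay buffer
        action = agent.random_action()
    else:
        action = agent.select_action(observation)
    observation, reward, done, _ = env.step(action)
    agent.observe(reward, observation, done)
    if step >= 1000:
        agent.update_policy()
    if done:
        observation = env.reset()
        agent.reset(observation)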
Example #13
class DDPG(Agent):
    def __init__(self, in_channels, num_actions, config):
        super(DDPG, self).__init__()

        self.nb_states = in_channels
        self.nb_actions = num_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': config['hidden1'],
            'hidden2': config['hidden2'],
            # 'hidden3': config['hidden3'],
            # 'hidden4': config['hidden4'],
            'init_w': config['init_w']
        }

        self.loss = nn.MSELoss()
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=config['plr'])

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=config['lr'])

        if isGPU:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        self.observation = config['observation']
        self.config = config

        if config['use_memory']:
            self.experience_replay = SequentialMemory(limit=config['memory_size'], window_length=1)
        else:
            self.experience_replay = deque(maxlen=config['memory_size'])  # Create Buffer replay

        self.random_process = OUProcess(size=self.nb_actions, theta=config['ou_theta'], mu=config['ou_mu'],
                                        sigma=config['ou_sigma'])

        self.batch_size = config['batch_size']
        self.tau = config['tau']
        self.discount = config['discount']
        self.depsilon = 1. / config['epsilon_decay']

        self.epsilon = 1.0

    def select_action(self, state, test=False):
        value_c, value_d = self.actor.forward(to_variable(state, volatile=True))

        action_d = (F.softmax(value_d))
        action_d = to_numpy(action_d.multinomial())

        action_c = to_numpy(value_c)
        action_c += (max(self.epsilon, 0) * self.random_process.sample()) if not test else 0
        action_c = action_c[0]
        return action_c, action_d

    def update(self, state, action, reward, new_state, done):
        if self.config['use_memory']:
            self.experience_replay.append(
                new_state.numpy(), action.tolist(), reward, done)  # add new transition to dataset
        else:
            self.experience_replay.append((state, action.tolist(), reward, new_state, done))

        if done:
            self.random_process.reset_states()

        self.epsilon -= self.depsilon

        if len(self.experience_replay) >= self.observation:  # if have enough experience example, go
            # Sample batch from memory replay

            if self.config['use_memory']:
                state_batch, action_batch, reward_batch, \
                next_state_batch, terminal_batch = self.experience_replay.sample_and_split(self.batch_size)
                state_batch = state_batch.reshape(-1, 4, 80, 80)
                next_state_batch = next_state_batch.reshape(-1, 4, 80, 80)

            else:
                mini_batch = random.sample(self.experience_replay, self.batch_size)
                state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
                action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
                reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
                next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
                terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]

            # Prepare for the target q batch
            value_c, _ = self.actor_target.forward(to_variable(next_state_batch, volatile=True))
            next_q_values = self.critic_target.forward([to_variable(next_state_batch, volatile=True), value_c])
            next_q_values.volatile = False

            y_batch = to_variable(reward_batch) + self.discount * \
                to_variable(terminal_batch) * next_q_values

            # Critic update
            self.critic.zero_grad()

            q_batch = self.critic.forward([to_variable(state_batch), to_variable(action_batch)])

            value_loss = self.loss(q_batch, y_batch)
            value_loss.backward()
            self.critic_optim.step()

            # Actor update
            self.actor.zero_grad()

            value_c, _ = self.actor.forward(to_variable(state_batch))
            policy_loss = -self.critic.forward([to_variable(state_batch), value_c])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            self.actor_optim.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)

    def save(self, file_path):
        torch.save((self.actor.state_dict(), self.critic.state_dict()), file_path)
        print("save model to file successful")

    def load(self, file_path):
        state_dicts = torch.load(file_path, map_location=lambda storage, loc: storage)
        self.actor.load_state_dict(state_dicts[0])
        self.critic.load_state_dict(state_dicts[1])
        print("load model to file successful")
Example #14
class DDPG(object):
    def __init__(self, args, nb_states, nb_actions):
        USE_CUDA = torch.cuda.is_available()
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1]
        self.gpu_used = True if self.gpu_ids[0] >= 0 else False

        net_cfg = {
            'hidden1':args.hidden1,
            'hidden2':args.hidden2,
            'init_w':args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.is_training = True

        self.continious_action_space = False

    def update_policy(self):
        pass

    def cuda_convert(self):
        if len(self.gpu_ids) == 1:
            if self.gpu_ids[0] >= 0:
                with torch.cuda.device(self.gpu_ids[0]):
                    print('model cuda converted')
                    self.cuda()
        if len(self.gpu_ids) > 1:
            self.data_parallel()
            self.cuda()
            self.to_device()
            print('model cuda converted and paralleled')

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def data_parallel(self):
        self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids)
        self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids)
        self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids)
        self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids)

    def to_device(self):
        self.actor.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.actor_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.critic.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))
        self.critic_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0])))

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        # self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        # proto action
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])),
            gpu_used=self.gpu_used
        ).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon
        
        # self.a_t = action
        return action

    def reset(self, s_t):
        self.s_t = s_t
        self.random_process.reset_states()

    def load_weights(self, dir):
        if dir is None: return

        if self.gpu_used:
            # load all tensors to GPU (gpu_id)
            ml = lambda storage, loc: storage.cuda(self.gpu_ids[0])
        else:
            # load all tensors to CPU
            ml = lambda storage, loc: storage

        self.actor.load_state_dict(
            torch.load('output/{}/actor.pkl'.format(dir), map_location=ml)
        )

        self.critic.load_state_dict(
            torch.load('output/{}/critic.pkl'.format(dir), map_location=ml)
        )
        print('model weights loaded')


    def save_model(self,output):
        if len(self.gpu_ids) == 1 and self.gpu_ids[0] > 0:
            with torch.cuda.device(self.gpu_ids[0]):
                torch.save(
                    self.actor.state_dict(),
                    '{}/actor.pt'.format(output)
                )
                torch.save(
                    self.critic.state_dict(),
                    '{}/critic.pt'.format(output)
                )
        elif len(self.gpu_ids) > 1:
            torch.save(self.actor.module.state_dict(),
                       '{}/actor.pt'.format(output)
            )
            torch.save(self.critic.module.state_dict(),
                       '{}/critic.pt'.format(output)
                       )
        else:
            torch.save(
                self.actor.state_dict(),
                '{}/actor.pt'.format(output)
            )
            torch.save(
                self.critic.state_dict(),
                '{}/critic.pt'.format(output)
            )

    def seed(self,seed):
        torch.manual_seed(seed)
        if len(self.gpu_ids) > 0:
            torch.cuda.manual_seed_all(seed)
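The map_location lambdas in load_weights above can also be expressed with a torch.device; a small equivalent sketch (the 'output/run1' path is illustrative):

# Device-based loading equivalent to the lambdas above; path is a placeholder.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
actor_state = torch.load('output/run1/actor.pkl', map_location=device)
critic_state = torch.load('output/run1/critic.pkl', map_location=device)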
Example #15
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }

        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10

    def update_policy(self, shared_model, args):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size, shared=args.use_more_states, num_states=args.num_states)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float64))*next_q_values

        # Critic update
        self.critic_optim.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        if args.shared:
            ensure_shared_grads(self.critic, shared_model.critic)

        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.shared:
            ensure_shared_grads(self.actor, shared_model.actor)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def share_memory(self):
        self.critic.share_memory()
        self.actor.share_memory()

    def add_optim(self, actor_optim, critic_optim):
        self.actor_optim = actor_optim
        self.critic_optim = critic_optim

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_models(self, agent):
        self.actor = deepcopy(agent.actor)
        self.actor_target = deepcopy(agent.actor_target)
        self.critic = deepcopy(agent.critic)
        self.critic_target = deepcopy(agent.critic_target)
        self.actor_optim = deepcopy(agent.actor_optim)
        self.critic_optim = deepcopy(agent.critic_optim)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def train(self):
        self.critic.train()
        self.actor.train()

    def state_dict(self):
        return [
            self.actor.state_dict(),
            self.actor_target.state_dict(),
            self.critic.state_dict(),
            self.critic_target.state_dict()
        ]

    def load_state_dict(self, list_of_dicts):
        self.actor.load_state_dict(list_of_dicts[0])
        self.actor_target.load_state_dict(list_of_dicts[1])
        self.critic.load_state_dict(list_of_dicts[2])
        self.critic_target.load_state_dict(list_of_dicts[3])

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
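Several examples also rely on to_tensor/to_numpy conversion helpers and a module-level criterion that are not shown. A sketch of typical definitions follows; the names and defaults here are assumptions, and the volatile=True flag used above is pre-0.4 PyTorch, whose modern counterpart is wrapping the forward pass in torch.no_grad():

import numpy as np
import torch
import torch.nn as nn

criterion = nn.MSELoss()

def to_tensor(ndarray, dtype=torch.float32, device='cpu'):
    # Convert a numpy array (or nested list) to a torch tensor on the given device.
    return torch.as_tensor(np.asarray(ndarray), dtype=dtype, device=device)

def to_numpy(tensor):
    # Detach from the graph and move to CPU before converting back to numpy.
    return tensor.detach().cpu().numpy()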
Example #16
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.epistemic_actor = args.epistemic_actor  # true / false
        self.epistemic_critic = args.epistemic_critic  # true / false

        self.aleatoric_actor = args.aleatoric_actor  # true / false
        self.aleatoric_critic = args.aleatoric_critic  # true / false

        self.dropout_n_actor = args.dropout_n_actor
        self.dropout_n_critic = args.dropout_n_critic

        self.dropout_p_actor = args.dropout_p_actor
        self.dropout_p_critic = args.dropout_p_critic

        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0

        # self.save_file = open(self.save_dir + '/std.txt', "a")

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_actor,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        net_cfg_critic = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_critic,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = UAActor(self.nb_states, self.nb_actions,
                                    **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions,
                               **net_cfg_critic)
        self.critic_target = UACritic(self.nb_states, self.nb_actions,
                                      **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q batch
        # TODO : Also apply epistemic and aleatoric uncertainty to both actor and critic target network
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float64)) * next_q_values

        #########################
        #  Critic update
        #########################
        self.critic.zero_grad()

        # TODO : Add epistemic uncertainty for critic network
        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        # TODO : Add aleatoric uncertainty term from aleatoric uncertainty output of critic network (Add uncertainty term in criterion)
        value_loss = criterion(q_batch, target_q_batch)

        value_loss.backward()
        self.critic_optim.step()

        #########################
        #  Actor update
        #########################
        self.actor.zero_grad()

        # policy loss
        # TODO : Add epistemic certainty term from aleatoric certainty output of policy network
        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        # policy_loss = policy_loss.mean() + actor_certainty

        policy_loss.backward()
        self.actor_optim.step()

        #########################
        #  Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    # def select_action(self, s_t, decay_epsilon=True):
    #     action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    #     action += self.is_training*max(self.epsilon, 0)*self.random_process.sample()
    #
    #     if decay_epsilon:
    #         self.epsilon -= self.depsilon
    #
    #     self.a_t = action
    #     return action

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        dropout_actions = np.array([])

        with torch.no_grad():
            for _ in range(self.dropout_n):
                action = to_numpy(
                    self.actor.forward_with_dropout(to_tensor(np.array(
                        [s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(to_tensor(np.array(
                    [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()

        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array(
                [s_t])))).squeeze(0)
            plt_action += self.is_training * max(
                self.epsilon, 0) * self.random_process.sample()
        """
        UNFIXED RESET POINT for Mujoco
        """
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            # self.action_std = np.append(self.action_std, [np.std(dropout_actions)])

            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(np.std(dropout_actions)) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(np.mean(dropout_actions)) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", np.std(dropout_actions),
                  "            ", "dir : ", str(self.save_dir))
        """
        FIXED RESET POINT for MCC
        """
        # if s_t[0] == -0.5 and s_t[1] == 0:
        #     # print("fixed dropout actions std", np.std(dropout_actions), "            ", "dir : ", str(self.save_dir))
        #     self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        #     # np.savetxt(self.save_dir + '/std.txt', self.action_std, fmt='%4.10f', delimiter=' ')
        #     with open(self.save_dir + "/std.txt", "a") as myfile:
        #         myfile.write(str(np.std(dropout_actions))+'\n')
        #     with open(self.save_dir + "/mean.txt", "a") as myfile:
        #         myfile.write(str(np.mean(dropout_actions))+'\n')

        if not (os.path.isdir(self.save_dir + "/episode/" +
                              str(self.episode))):
            os.makedirs(
                os.path.join(self.save_dir + "/episode/" + str(self.episode)))

        self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        with open(self.save_dir + "/episode/" + str(self.episode) + "/std.txt",
                  "a") as myfile:
            myfile.write(str(np.std(dropout_actions)) + '\n')

        with open(
                self.save_dir + "/episode/" + str(self.episode) + "/mean.txt",
                "a") as myfile:
            myfile.write(str(np.mean(dropout_actions)) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = plt_action

        return plt_action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
Example #17
    def __init__(self, nb_states, nb_actions, now_date, now_time, args):
        print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")
        if args.seed > 0:
            self.seed(args.seed)

        self.total_training_step = 1
        self.episode = 0
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        # self.criterion = nn.MSELoss()
        self.critic_case = 'stochastic'
        self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, True,
                                    **net_cfg)

        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, False,
                               **net_cfg)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, True,
                                      **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.s_t_noise = None  # Most recent state
        self.a_t_mean = None  # Most recent action
        self.a_t_var = None
        self.is_training = True

        if torch.cuda.is_available():
            self.cuda()

        self.now_date = now_date
        self.now_time = now_time

        if os.path.exists('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                          self.now_time + '/') is False:
            os.mkdir('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                     self.now_time + '/')
Example #18
class DDPG_trainer(object):
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA: self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        self.start_state = observation
        self.s_t = observation  # observe() stores transitions from self.s_t
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):

        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Warm-up: skip updates until the replay buffer holds enough transitions
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
                         DISCOUNT * to_tensor(terminal_batch.astype(np.float64)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
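The hard_update and soft_update helpers used throughout these PyTorch examples come from a small utility module that the listing does not show; a minimal sketch of the usual definitions (assuming the standard copy / Polyak-averaging convention, consistent with the explicit parameter loops in Example #24) is:

def hard_update(target, source):
    # Copy every source parameter into the target network verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)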
Example #19
0
actor.add(Dense(8))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('sigmoid'))


action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + (11,), name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)

agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=10,
                  random_process=random_process, gamma=.995, target_model_update=1e-3)

agent.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
agent.fit(env, nb_steps=10000, visualize=False, verbose=0, nb_max_episode_steps=95)   
#agent.save_weights('weights/ddpg_{}_weights.h5f'.format("stormwater"), overwrite=True)
agent.test(env, nb_episodes=15, visualize=False, nb_max_episode_steps=95, plt="") 
model.add(Convolution2D(32, 8, 8, subsample=(4, 4), input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE))
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use any built-in Keras
# optimizer and metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an ongoing research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
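The snippet above stops before the agent itself is built; a minimal sketch of how these pieces are typically wired together in a keras-rl Atari setup (the env, nb_steps_warmup, train_interval and delta_clip values here are illustrative and not taken from this snippet) is:

dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               processor=processor, nb_steps_warmup=50000, gamma=.99,
               target_model_update=10000, train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=1750000)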
Example #21
0
class DDPG(object):
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA: self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch (no gradient is needed through the targets)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])[:, 0]

        target_q_batch = to_tensor(reward_batch) + \
            self.discount*to_tensor(terminal_batch.astype(np.float64))*next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()

        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.0)
        # Manual SGD-style step applied in addition to the Adam update below.
        for p in self.critic.parameters():
            p.data.add_(p.grad.data, alpha=-CRITIC_LR)
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.0)
        # Manual SGD-style step applied in addition to the Adam update below.
        for p in self.actor.parameters():
            p.data.add_(p.grad.data, alpha=-ACTOR_LR)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()

        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
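to_tensor, to_numpy, USE_CUDA and FLOAT in these PyTorch examples come from a shared utility module that is not part of the listing; a minimal modern sketch (the volatile argument is kept only for call-site compatibility, since gradient tracking is now handled with torch.no_grad()) is:

import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()
FLOAT = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor


def to_tensor(ndarray, volatile=False, requires_grad=False, dtype=FLOAT):
    # Convert a numpy array to a (possibly CUDA) float tensor.
    t = torch.from_numpy(ndarray).type(dtype)
    t.requires_grad_(requires_grad)
    return t


def to_numpy(tensor):
    # Detach and move to host memory before converting back to numpy.
    return tensor.detach().cpu().numpy()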
Example #22
0
    def __init__(self, nb_states, nb_actions, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.epistemic_actor = args.epistemic_actor  # true / false
        self.epistemic_critic = args.epistemic_critic  # true / false

        self.aleatoric_actor = args.aleatoric_actor  # true / false
        self.aleatoric_critic = args.aleatoric_critic  # true / false

        self.dropout_n_actor = args.dropout_n_actor
        self.dropout_n_critic = args.dropout_n_critic

        self.dropout_p_actor = args.dropout_p_actor
        self.dropout_p_critic = args.dropout_p_critic

        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0

        # self.save_file = open(self.save_dir + '/std.txt', "a")

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_actor,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        net_cfg_critic = {
            'dropout_n': args.dropout_n_actor,
            'dropout_p': args.dropout_p_critic,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }

        self.actor = UAActor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = UAActor(self.nb_states, self.nb_actions,
                                    **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions,
                               **net_cfg_critic)
        self.critic_target = UACritic(self.nb_states, self.nb_actions,
                                      **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        #
        if USE_CUDA:
            self.cuda()
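The dropout_n / dropout_p settings above configure Monte-Carlo dropout in the uncertainty-aware networks: dropout stays active at inference time and the epistemic uncertainty is read off the spread of several stochastic forward passes. The UAActor/UACritic internals are not included in this listing, but a minimal sketch of that estimation step, assuming a network whose dropout layers are left in train mode, is:

import torch


def mc_dropout_forward(net, x, n_passes):
    # Run n_passes stochastic passes; the per-element variance serves as the
    # epistemic uncertainty estimate.
    samples = torch.stack([net(x) for _ in range(n_passes)], dim=0)
    return samples.mean(dim=0), samples.var(dim=0)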
Example #23
0
	obs = obs[-2:]
	ob = np.array(obs)
	ob = np.max(ob, 0)
	return ob, tot_reward, done
	
import pickle
def serialize_array(a):
	return pickle.dumps(a, protocol=2)  # protocol 2 is a compact binary format

def deserialize_array(serialized):
	return pickle.loads(serialized)

newGame()
done = False

replay_buffer = SequentialMemory(max_size=REPLAY_MEMORY_SIZE)
total_step_count = 0

print('GC', gc.isenabled())
#gc.set_debug(True)

while True:
	action = agent.act(ob, reward, done)

	ob, reward, done = gameStep(action)

	lastOb = lastFrame
	lastObCompressed = lastFrameCompressed
	lastObOrig = lastFrameOrig
	lastFrame, lastFrameOrig = preprocess(ob)
	lastFrameCompressed = lz4framed.compress(serialize_array(lastFrame))
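The frames above are stored compressed, so reading an entry back out of the replay buffer reverses both steps; a minimal round-trip sketch using the helpers defined in this example (assuming the py-lz4framed package, which provides compress/decompress) is:

compressed = lz4framed.compress(serialize_array(lastFrame))
restored = deserialize_array(lz4framed.decompress(compressed))
assert (restored == lastFrame).all()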
Example #24
0
class DDPG():
    def __init__(self, env, policy, gamma, tau, epsilon, epsilon_decay,
                 actor_lr, critic_lr, theta, sigma, mu, buffer_size):

        #self.num_states = num_states
        #self.num_actions = num_actions
        #self.is_training = False
        self.env = env

        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.theta = theta
        self.sigma = sigma
        self.mu = mu
        self.buffer_size = buffer_size

        self.policy = policy
        self.actor = policy.actor
        self.critic = policy.critic
        self.actor_target = policy.actor_target
        self.critic_target = policy.critic_target
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_lr)
        self.criterion = nn.MSELoss()

        #the actor/actor_target and critic/critic_target need to have the same weights to start with
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.memory = SequentialMemory(limit=self.buffer_size, window_length=1)
        #self.replay = ExpcerienceReplay(BUFFER_SIZE,BATCH_SIZE)

        self.ou_noise = Ornstein_Uhlenbeck(theta=self.theta,
                                           sigma=self.sigma,
                                           mu=self.mu)

        if USE_CUDA: self.cuda()

    def update(self):

        s, a, r, s_, done = self.memory.sample_and_split(64)
        #turn all numpy arrays into pytorch variables
        s = Variable(torch.from_numpy(s), requires_grad=False).type(FLOAT)
        a = Variable(torch.from_numpy(a), requires_grad=False).type(FLOAT)
        s_ = Variable(torch.from_numpy(s_), requires_grad=True).type(FLOAT)
        r = Variable(torch.from_numpy(r), requires_grad=False).type(FLOAT)
        done = Variable(torch.from_numpy(done),
                        requires_grad=False).type(FLOAT)
        #get target q value

        q = self.critic_target(s_, self.actor_target(s_, ))

        q_target_batch = r + self.gamma * done * q

        #update Critic by minimizing MSE Loss
        self.critic.zero_grad()
        q_batch = self.critic(s, a)
        L = self.criterion(q_batch, q_target_batch)
        L.backward()
        self.critic_optim.step()

        #update Actor by using the sampled policy gradient
        self.actor.zero_grad()
        policy_loss = -self.critic(s, self.actor(s))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        #update targets for the target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        self.epsilon -= self.epsilon_decay

    def remember(self, s, a, r, s_, done):
        #self.replay.remember(s,a,r,s_,done)
        self.memory.append(s, a, r, done)

    def select_random_action(self):
        return np.random.uniform(low=[0, -1], high=[1, 1], size=(2, ))

    def select_action(self, s):
        self.eval_mode()
        s = Variable(torch.from_numpy(s), volatile=False,
                     requires_grad=False).type(FLOAT)
        noise = Variable(torch.from_numpy(self.ou_noise.sample()),
                         volatile=False,
                         requires_grad=False).type(FLOAT)
        noise = self.epsilon * noise
        #noise = Variable(torch.from_numpy(np.random.normal(0,0.02, size=self.env.action_space.shape[0])), volatile=False, requires_grad=False).type(FLOAT)
        #s  = torch.FloatTensor(s).to(device)
        #print(s.size())
        #s.view(1, -1)
        action_pytorch = self.actor(s).squeeze(0)
        # Add the scaled exploration noise, then convert the noisy action to numpy
        action = action_pytorch + noise
        #print(action_pytorch, action)
        action = action.cpu().data.numpy() if USE_CUDA else action.data.numpy()
        action[0] = np.clip(action[0], 0., 1.)
        action[1] = np.clip(action[1], -1., 1.)
        self.train_mode()
        return action

    def get_return(self, trajectory):
        """
        Calculate the discounted future reward based on the trajectory of an entire episode
        """
        r = 0.0
        for i in range(len(trajectory)):
            r += self.gamma**i * trajectory[i]
        return r

    def reset(self):
        self.ou_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def eval_mode(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train_mode(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def seed(self, s):

        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)

    def save(self, PATH):
        self.policy.save(PATH)

    def load(self, PATH):
        self.policy.load(PATH)
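A minimal sketch of the episode loop this class is written for (the env, policy object, episode counts and hyper-parameter values below are illustrative and not part of the example):

agent = DDPG(env, policy, gamma=0.99, tau=1e-3, epsilon=1.0, epsilon_decay=1e-5,
             actor_lr=1e-4, critic_lr=1e-3, theta=0.15, sigma=0.2, mu=0.0,
             buffer_size=100000)

for episode in range(num_episodes):
    s = env.reset()
    agent.reset()
    done = False
    while not done:
        a = agent.select_action(s)
        s_, r, done, _ = env.step(a)
        agent.remember(s, a, r, s_, done)
        if agent.memory.nb_entries > 64:  # update() samples batches of 64
            agent.update()
        s = s_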
Example #25
0
STATE_SIZE = 2048
var_loss_coef1 = K.variable(0)
var_loss_coef2 = K.variable(0)
var_loss_coef3 = K.variable(0)

model_env = model_state = model_next_state = model_next_state_auto = model_reward = meanImage = None

if not args.env_model is None:
	model_env, model_state, model_next_state, model_next_state_auto, model_reward = load_model(args.env_model, args.env_weight, args.env_reward_weight, STATE_SIZE, ACTION_COUNT, AGENT_HISTORY_LENGTH, 1, var_loss_coef1, var_loss_coef2, var_loss_coef3)	
	meanImage = np.load(args.env_mean_image)
	print(model_env.summary())

newGame()
done = False

replay_buffer = SequentialMemory(max_size=REPLAY_MEMORY_SIZE)
total_step_count = 0

#REPLAY_START_SIZE = 1000
#FINAL_EXPLORATION_FRAME = 50000
#REPLAY_START_SIZE = 5000
episode_reward = 0
epsilon = INITIAL_EXPLORATION

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def weight_norms(model):
	ws = model.get_weights()
	for w in ws:
Example #26
0
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, now_date, now_time, args):
        print("UADDPG!!!!!!!!!!!!!!!!!!!!!!!!!")
        if args.seed > 0:
            self.seed(args.seed)

        self.total_training_step = 1
        self.episode = 0
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        # self.criterion = nn.MSELoss()
        self.critic_case = 'stochastic'
        self.actor = UAActor(self.nb_states, self.nb_actions, False, **net_cfg)
        self.actor_target = UAActor(self.nb_states, self.nb_actions, True,
                                    **net_cfg)

        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = UACritic(self.nb_states, self.nb_actions, False,
                               **net_cfg)
        self.critic_target = UACritic(self.nb_states, self.nb_actions, True,
                                      **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.s_t_noise = None  # Most recent state
        self.a_t_mean = None  # Most recent action
        self.a_t_var = None
        self.is_training = True

        if torch.cuda.is_available():
            self.cuda()

        self.now_date = now_date
        self.now_time = now_time

        if not os.path.exists('/mnt/sda2/DRL/UNIAC/model_' + self.now_date +
                              '_' + self.now_time + '/'):
            os.mkdir('/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
                     self.now_time + '/')

    def update_policy(self):
        # print("Policy update starts...")
        # Sample batch
        state_batch, state_noise_batch, action_mean_batch, action_var_batch, reward_batch, next_state_batch, next_state_noise_batch, terminal_batch = self.memory.sample_and_split(
            self.batch_size)

        # Prepare for the target q

        with torch.no_grad():
            action_mean, action_var = self.actor_target(
                to_tensor(next_state_batch, volatile=True),
                to_tensor(next_state_noise_batch, volatile=True))
            next_q_values = self.critic_target(
                to_tensor(next_state_batch, volatile=True),
                to_tensor(next_state_noise_batch, volatile=True), action_mean,
                action_var)

        target_q_batch_mean = to_tensor(
            reward_batch) + self.discount * to_tensor(
                terminal_batch.astype(np.float64)) * next_q_values[0]
        target_q_batch_var = to_tensor(
            reward_batch) + self.discount * to_tensor(
                terminal_batch.astype(np.float64)) * next_q_values[1]

        # Critic update
        self.critic.zero_grad()

        # case 1 : Stochastic error (KL Divergence on both distribution)
        if self.critic_case == 'stochastic':

            q_batch = self.critic(to_tensor(state_batch),
                                  to_tensor(state_noise_batch),
                                  to_tensor(action_mean_batch),
                                  to_tensor(action_var_batch))
            value_loss = KLDLoss(q_batch[0], q_batch[1], target_q_batch_mean,
                                 target_q_batch_var)

        # case 2 : Deterministic error (MSE error)
        else:
            q_batch = self.critic(to_tensor(state_batch),
                                  to_tensor(state_noise_batch),
                                  to_tensor(action_mean_batch),
                                  to_tensor(action_var_batch))
            # Collapse each (mean, var) pair into a single deterministic value
            q_batch_sample = q_batch[0] - q_batch[1]
            target_q_batch_sample = target_q_batch_mean - target_q_batch_var
            value_loss = nn.MSELoss()(q_batch_sample, target_q_batch_sample)

        value_loss.backward()

        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        action_mean, action_var = self.actor(to_tensor(state_batch),
                                             to_tensor(state_noise_batch))

        policy_loss_mean, policy_loss_var = self.critic(
            to_tensor(state_batch), to_tensor(state_noise_batch), action_mean,
            action_var)
        # policy_loss_mean = -policy_loss_mean

        if self.critic_case == 'stochastic':
            # policy_loss = policy_loss_mean.mean() + policy_loss_var.mean()
            policy_loss = policy_loss_mean.mean()
        else:
            policy_loss = (policy_loss_mean - policy_loss_var).mean()

        policy_loss.requires_grad = True
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        # print("Policy update ends...")

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_next_mean, s_next_var, done):
        if self.is_training:
            self.memory.append(self.s_t, self.s_t_noise, self.a_t_mean,
                               self.a_t_var, r_t, done)
            self.s_t = s_next_mean
            self.s_t_noise = s_next_var

    def random_action(self):
        action_mean = np.random.uniform(-1., 1., self.nb_actions)
        action_var = np.random.uniform(-2., 2., self.nb_actions)
        self.a_t_mean = action_mean
        self.a_t_var = action_var
        return action_mean

    def select_action(self, s_t, s_t_noise, decay_epsilon=True):
        action_mean, action_var = self.actor(to_tensor(np.array([s_t])),
                                             to_tensor(np.array([s_t_noise])))

        action_noise = []

        # amplification = 10000 - self.total_training_step / 100
        # if amplification < 1:
        #     amplification = 1
        amplification = 1

        for index in range(action_mean.shape[0]):
            action_noise.append(
                np.random.normal(0,
                                 action_var.cpu()[index] * amplification, 1))

        # action_mean += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        action_sample = action_mean + max(self.epsilon, 0) * torch.tensor(
            np.array(action_noise).squeeze()).cuda()

        # print("action_mean", action_mean)
        # print("action_noise", action_noise)
        # print("action_sample", action_sample)

        # action_sample = np.clip(action_sample, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t_mean = action_mean.detach().cpu().numpy()
        self.a_t_var = action_var.detach().cpu().numpy()
        self.total_training_step = self.total_training_step + 1

        action_mean_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_mean.txt', 'a')
        action_var_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_var.txt', 'a')
        action_noise_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_noise.txt', 'a')
        action_sample_file = open(
            '/mnt/sda2/DRL/UNIAC/model_' + self.now_date + '_' +
            self.now_time + '/action_sample.txt', 'a')

        action_mean_file.write(str(action_mean) + '\n')
        action_var_file.write(str(action_var) + '\n')
        action_noise_file.write(str(action_noise) + '\n')
        action_sample_file.write(str(action_sample) + '\n')

        action_mean_file.close()
        action_var_file.close()
        action_noise_file.close()
        action_sample_file.close()

        return action_sample.detach().cpu().numpy()
        # return np.clip(action_sample.detach().cpu().numpy(), -1.0, 1.0)

    def reset(self, obs, obs_noise):
        self.s_t = obs
        self.s_t_noise = obs_noise
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
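The KLDLoss used in update_policy above is not defined anywhere in this listing; assuming it is the closed-form KL divergence between two Gaussian Q-value distributions parameterised by (mean, variance), a minimal sketch is:

import torch


def KLDLoss(mean_p, var_p, mean_q, var_q, eps=1e-6):
    # KL( N(mean_p, var_p) || N(mean_q, var_q) ), averaged over the batch.
    var_p = var_p.clamp(min=eps)
    var_q = var_q.clamp(min=eps)
    kl = 0.5 * (torch.log(var_q / var_p)
                + (var_p + (mean_p - mean_q) ** 2) / var_q
                - 1.0)
    return kl.mean()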