def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize a random Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()
Example #2
    def __init__(
        self,
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=0.0001,
        critic_learning_rate=0.001,
        gamma=0.99,
        tau=0.001,
    ):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau

        # make main networks
        self.actor = Actor(obs_dim, action_dim, action_gain,
                           actor_learning_rate)
        self.critic = Critic(obs_dim, action_dim, critic_learning_rate)

        # make target networks
        self.target_actor = Actor(obs_dim, action_dim, action_gain)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic = Critic(obs_dim, action_dim)
        self.target_critic.model.set_weights(self.critic.model.get_weights())
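Example #2 above stores `tau` but only shows a hard copy of the online weights into the targets. As a minimal sketch (not part of the original example, assuming Keras models that expose get_weights()/set_weights()), the Polyak soft update that `tau` is typically used for could look like this, called after each learning step as `self.soft_update(self.actor.model, self.target_actor.model)` and likewise for the critic:

    def soft_update(self, main_model, target_model):
        # Polyak averaging: target <- tau * main + (1 - tau) * target
        new_weights = [
            self.tau * mw + (1.0 - self.tau) * tw
            for mw, tw in zip(main_model.get_weights(), target_model.get_weights())
        ]
        target_model.set_weights(new_weights)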
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = eps_start
        self.t_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
                
        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
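Several of these examples construct an `OUNoise` object whose implementation is not shown. A self-contained sketch of the Ornstein-Uhlenbeck process they rely on (parameter defaults are illustrative, not the exact class imported above):

import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process (illustrative sketch only)."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean
        self.theta = theta             # mean-reversion rate
        self.sigma = sigma             # noise scale
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state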
Example #4
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        #--- actor -----#

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        #---- critic -----#

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
    def __init__(self, state_size, action_size, random_seed, num_agents):
        """Initialize an Agent object.
         """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=0.1)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.num_agents = num_agents
    def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
        """
			state_size (int): dimension of each state
			action_size (int): dimension of each action
			BATCH_SIZE (int): mini batch size
			BUFFER_SIZE (int): experience buffer length; keep it as high as possible
			discount_factor (float): discount factor for calculating Q_target
			tau (float): interpolation parameter for updating target network
			noise_coefficient_start (float): value to be multiplied to OUNoise sample
			noise_coefficient_decay (float): exponential decay factor for value to be multiplied to OUNoise sample
			LR_ACTOR (float): learning rate for actor network
			LR_CRITIC (float): learning rate for critic network
			WEIGHT_DECAY (float): Weight decay for critic network optimizer
			device : "cuda:0" if torch.cuda.is_available() else "cpu"
		"""

        self.state_size = state_size
        print(device)
        self.action_size = action_size
        self.BATCH_SIZE = BATCH_SIZE
        self.BUFFER_SIZE = BUFFER_SIZE
        self.discount_factor = discount_factor
        self.tau = tau
        self.noise_coefficient = noise_coefficient_start
        self.noise_coefficient_decay = noise_coefficient_decay
        self.steps_completed = 0
        self.device = device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((1, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE)
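The decaying `noise_coefficient` documented above is the only exploration control stored here. A hypothetical `act()` showing how it is typically applied (assuming `import torch` and `import numpy as np` at module level, `OUNoise.sample()` returning a NumPy array, and actions bounded in [-1, 1]):

    def act(self, state, add_noise=True):
        """Return an action for the given state, scaling OU noise by the decaying coefficient."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise_coefficient * self.noise.sample()
            # exponential decay of the exploration noise, as described in the docstring
            self.noise_coefficient *= self.noise_coefficient_decay
        self.steps_completed += 1
        return np.clip(action, -1, 1)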
Example #7
    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = int(1e4),
                 name_cases='myproject'):
        """ Initialize. """

        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0
        # mode: train / test
        self.is_test = False
        self.populate(self.initial_random_episode)
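The `populate()` call above is not shown in this example. A hypothetical warm-up that fills the replay buffer with uniformly random actions before training (the buffer's `store()` method name and the old gym step API are assumptions):

    def populate(self, n_episodes: int):
        """Hypothetical sketch: collect transitions under a random policy (the real populate() may differ)."""
        for _ in range(int(n_episodes)):
            state = self.env.reset()
            done = False
            while not done:
                action = self.env.action_space.sample()
                next_state, reward, done, _ = self.env.step(action)
                # `store` is an assumed ReplayBuffer method name
                self.memory.store(state, action, reward, next_state, done)
                state = next_state
                self.total_step += 1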
Example #8
    def create_critic(self, alpha, hidden_layers):
        params = {
            'input_shape':   self.env.observation_space.shape,
            'output_shape':  self.env.action_space.shape,
            'hidden_layers': hidden_layers
        }
        self.critic = OpenStruct()
        self.critic.online = Critic("{}.critic.online".format(self.name), **params)
        self.critic.target = Critic("{}.critic.target".format(self.name), **params)
    def test_critic(self):
        Actor_obj = Actor(1, 16, 4)
        Critic_obj = Critic(4, 16, 1)
        # critic_optimizer = optim.SGD(Critic_obj.parameters(), lr=C_learning_rate)

        y = Actor_obj.forward(torch.FloatTensor([1]))
        # Forward Propagation
        y_pred = Critic_obj.forward(y)
        self.assertTrue(len(y_pred) == 1)
Example #10
    def __init__(self, n, state_size, action_size, random_seed, params):
        """Initialize an Agent object.
        
        Params
        ======
            n (int): number of agents in env
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            params (dict): dictionary with hyperparameters name-value pairs
        """
        self.n = n
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.BUFFER_SIZE = params["BUFFER_SIZE"]
        self.BATCH_SIZE = params["BATCH_SIZE"]
        self.GAMMA = params["GAMMA"]
        self.TAU = params["TAU"]
        self.LR_ACTOR = params["LR_ACTOR"]
        self.LR_CRITIC = params["LR_CRITIC"]
        self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
        self.N_UPDATES = params["N_UPDATES"]
        self.UPDATE_STEP = params["UPDATE_STEP"]

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.n, action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

        #Count timesteps
        self.timestep = 0
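Example #10 stores `N_UPDATES`, `UPDATE_STEP`, and a timestep counter but does not show the learning loop. A hypothetical companion `step()` using them (the `learn()` method and the `ReplayBuffer` interface with `add()`, `sample()`, and `__len__` are assumptions):

    def step(self, states, actions, rewards, next_states, dones):
        """Hypothetical sketch: save experience, then learn N_UPDATES times every UPDATE_STEP timesteps."""
        for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
            self.memory.add(s, a, r, ns, d)
        self.timestep = (self.timestep + 1) % self.UPDATE_STEP
        if self.timestep == 0 and len(self.memory) > self.BATCH_SIZE:
            for _ in range(self.N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA)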
Example #11
    def __init__(self,
                 init_pose=None,
                 init_velocities=None,
                 init_angle_velocities=None,
                 runtime=5.,
                 target_pos=None,
                 buffer_size=150000,
                 batch_size=32,
                 gamma=0.99,
                 replay_alpha=0.5,
                 beta_limit=10000):

        self.task = Task(init_pose, init_velocities, init_angle_velocities,
                         runtime, target_pos)

        self.state_size = self.task.state_size
        self.action_size = self.task.action_size

        self.state = self.task.reset()

        self.memory = PrioritizedReplay(buffer_size, batch_size, replay_alpha,
                                        beta_limit)

        self.actor = Actor(self.state_size, self.action_size,
                           self.task.action_low, self.task.action_high)
        self.actor_weights = self.actor.model.trainable_weights
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.task.action_low, self.task.action_high)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_weights = self.critic.model.trainable_weights
        self.critic_target = Critic(self.state_size, self.action_size)

        self.gamma = gamma

        # how much influence older weights have when updating target
        self.tau = 0.03

        #noise
        # GENTLE LANDING
        #self.mu = 0
        #self.theta = 0.1
        #self.sigma = 25
        self.mu = 0
        self.theta = 0.1
        self.sigma = 9
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

        self.episodes = 0
        self.training_step = 0
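The `replay_alpha` and `beta_limit` arguments suggest proportional prioritized experience replay (Schaul et al., 2016). Under that assumption, a small sketch of how sampling probabilities and importance-sampling weights are usually computed from stored priorities:

import numpy as np

def per_probabilities_and_weights(priorities, alpha, beta):
    """P(i) = p_i**alpha / sum_k p_k**alpha; w_i = (N * P(i))**(-beta), normalized by the max weight."""
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = scaled / scaled.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return probs, weights / weights.max()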
Example #12
    def __init__(self,
                 env,
                 act_dim,
                 state_dim,
                 goal_dim,
                 act_range,
                 buffer_size=int(1e6),
                 gamma=0.98,
                 lr=0.001,
                 tau=0.95):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers
        self.goal_normalizer = Normalizer(
            goal_dim, default_clip_range=5)  # Clip between [-5, 5]
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
    def test_actor(self):
        Actor_obj = Actor(1, 16, 4)
        Critic_obj = Critic(4, 16, 1)
        # actor_optimizer = optim.SGD(Actor_obj.parameters(), lr=0.1, momentum=0.5)

        # Forward Propagation
        y = Actor_obj.forward(torch.FloatTensor([1]))
        self.assertTrue(len(y) == 4)
Example #14
class ActorCritic(object):
    def __init__(self, env):
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        # num_features = 14
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())

    def get_action(self, state, episode_percentage):
        # state = state[0:14]

        # Sometimes pick random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            # print 'random'
            return self.action_space.sample()
        else:
            # print 'not random'
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        # if (episode_percentage > .8):
        # 	epsilon = 0.3
        # else:
        epsilon = -1 * (episode_percentage**2) + 1
        # epsilon = -1 * (episode_percentage - 1) ** 3
        # epsilon = -0.8 * (episode_percentage - 1) ** 3 + 0.2
        # epsilon = -0.8 * episode_percentage + 1
        # print epsilon
        return epsilon

    def update(self, state, action, reward, new_state):
        # state = state[0:14]
        # new_state = new_state[0:14]

        td_error = self.critic.learn(
            state, reward,
            new_state)  # gradient = grad[r + gamma * V(s_) - V(s)]
        # print td_error
        self.actor.learn(
            state, action,
            td_error)  # true_gradient = grad[logPi(s,a) * td_error]

    def get_name(self):
        return 'ActorCritic'
Example #15
    def __init__(self, env):
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        # num_features = 14
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())
Example #16
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)
Example #17
    def __init__(self,
                 n_state,
                 n_action,
                 a_limit,
                 model_folder=None,
                 memory_size=10000,
                 batch_size=32,
                 tau=0.01,
                 gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the network and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state,
                                  self.n_action,
                                  self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state,
                                    self.n_action,
                                    trainable=False)

        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the parameter of target network is the same as evaluate network
        self.hardCopy()
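The flat NumPy buffer above has 2 * n_state + n_action + 1 columns per row. A sketch (an assumption, not shown in the original) of how one transition is typically packed into it:

    def store_transition(self, s, a, r, s_):
        """Store one row laid out as [state | action | reward | next_state]."""
        transition = np.hstack((s, a, [r], s_))
        index = self.memory_counter % self.memory_size  # overwrite the oldest row when full
        self.memory[index, :] = transition
        self.memory_counter += 1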
Example #18
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: State Dimension
        :param action_size: Action dimension
        :param action_sigma: standard deviation of the noise to be added to the action
        :param memory_size:
        :param batch:
        :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper)
        :param noise_clip: How much noise to allow
        :param gamma:
        :param update_frequency:
        :param seed:
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)
Example #19
def MountainCar():
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    env.reset()
    env.render()

    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    sess = tf.Session()

    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    game = Game(env, actor, critic)
    game.run_mountain_car()
Example #20
def main(args):
    with tf.device(args['device']):

        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict,
                                 a_dim,
                                 noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
Example #21
def CartPoleAC():
    env1 = gym.make('CartPole-v0')
    # env2 = gym.make('CartPole-v0')
    env1.seed(10)
    # env2.seed(2)
    env1 = env1.unwrapped
    env1.reset()
    # env2 = env2.unwrapped
    # env2.reset()

    n_features = env1.observation_space.shape[0]
    n_actions = env1.action_space.n

    sess = tf.Session()

    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    g = Game(env1, actor, critic)
    g.run()
Example #22
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)
Example #23
class Agent():
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, step, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()

        if epsilon:
            noise = np.random.normal(0, 0.1, action.shape[0])
            action += noise

        return action

    def update(self, step):

        state, action, reward, next_state, done = self.memory.sample()

        next_state_action = self.target_actor(next_state)

        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
Example #24
class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two methods to update the target actor
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))

        self.target_critic.set_weights(new_weights)
        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)  # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic(
                [s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # the actor output is normalized (batch normalization), so it is multiplied by upper_bound
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler(
        ) as interruption:  # to properly save even if ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" %
                                       save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" %
                                        save_name)

        # to save in other format
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' %
                                        save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the policy learned
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (But not in python notebook)
            # if done:
            #     break
        states = np.squeeze(
            np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self,
                     avg_reward=None,
                     actions=None,
                     states=None,
                     train=False,
                     title=None):
        # An additional way to visualize the avg episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Epsiodic Reward")
            plt.show()
        else:  # work only for Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(u'$\\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(u'$\\omega$')
            ax[2].plot(states[:, 2])  # ang velocity
            fig.canvas.set_window_title(title)
Example #25
class DDPG:
    def __init__(
        self,
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=0.0001,
        critic_learning_rate=0.001,
        gamma=0.99,
        tau=0.001,
    ):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau

        # make main networks
        self.actor = Actor(obs_dim, action_dim, action_gain,
                           actor_learning_rate)
        self.critic = Critic(obs_dim, action_dim, critic_learning_rate)

        # make target networks
        self.target_actor = Actor(obs_dim, action_dim, action_gain)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic = Critic(obs_dim, action_dim)
        self.target_critic.model.set_weights(self.critic.model.get_weights())

    def act(self, obs):
        return self.actor.act(obs)[0]

    @tf.function
    def update_networks(self, batch):
        """ runs all updates from provided training batch """
        s, a, r, t, next_s = (tf.cast(i, tf.float32) for i in batch)
        self.update_critic(s, a, r, next_s)
        self.update_actor(s, a, r, next_s)
        self.update_target(self.actor.model, self.target_actor.model)
        self.update_target(self.critic.model, self.target_critic.model)

    @tf.function
    def update_critic(self, s, a, r, next_s):
        """ minimize td-loss from target """
        # td estimate based on targets' behavior
        target_future_actions = self.target_actor.act(next_s)
        target_future_qs = self.target_critic.estimate_q(
            next_s, target_future_actions)
        target_current_qs = r + self.gamma * target_future_qs

        # update main critic
        main_current_qs = self.critic.model([s, a])
        loss = keras.losses.mse(target_current_qs, main_current_qs)

        model_vars = self.critic.model.trainable_variables

        dloss_dcrit = tf.gradients(loss, model_vars)
        self.critic.optimizer.apply_gradients(zip(dloss_dcrit, model_vars))

    @tf.function
    def update_actor(self, s, a, r, next_s):
        """ dq_dtheta = dq_da * da_dtheta"""
        # first, finding dq_da
        proposed_a = self.actor.model(s)
        q = self.critic.model([s, proposed_a])
        dq_da = tf.gradients(q, proposed_a)[0]

        # second, finding dq_da * da_dtheta
        model_vars = self.actor.model.trainable_variables
        dq_dtheta = tf.gradients(proposed_a, model_vars, grad_ys=-dq_da)

        # updating the model
        self.actor.optimizer.apply_gradients(zip(dq_dtheta, model_vars))

    @tf.function
    def update_target(self, main_model, target_model):
        """ target = tau*main + (1-tau)*target """
        for model_weight, target_weight in zip(main_model.weights,
                                               target_model.weights):
            target_weight.assign(self.tau * model_weight +
                                 (1 - self.tau) * target_weight)

    def save_model(self, save_dir):
        """ saves the main and target networks"""
        self.actor.model.save_weights(os.path.join(save_dir, "actor"))
        self.critic.model.save_weights(os.path.join(save_dir, "critic"))
        self.target_actor.model.save_weights(
            os.path.join(save_dir, "target_actor"))
        self.target_critic.model.save_weights(
            os.path.join(save_dir, "target_critic"))
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize a random Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()

    def initial(self):
        self.steps = 0
        self.action = torch.zeros(self.action_dim, device=cuda)
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros((self.start_size - 1, self.state_dim),
                                         device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

    def train_on_batch(self):
        # Sample a random minibatch of N transitions from replay buffer
        sample = torch.randint(self.time_dim,
                               self.replay_reward.shape[0], [self.batch_size],
                               device=cuda)
        index = torch.stack([sample - i for i in range(self.time_dim, 0, -1)
                             ]).t().reshape(-1)

        state_data = min_max_scale(self.replay_state[:, 0, :])
        amount_data = min_max_scale(self.replay_state[:, 2, :])
        next_state_data = min_max_scale(self.replay_next_state[:, 0, :])
        next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])

        state_batch = torch.index_select(state_data, 0,
                                         index).view(self.batch_size, -1)
        amount_data = torch.index_select(amount_data, 0,
                                         sample).view(self.batch_size, -1)
        state_batch = torch.cat([state_batch, amount_data], dim=1)
        next_state_batch = torch.index_select(next_state_data, 0,
                                              index).view(self.batch_size, -1)
        next_amount_data = torch.index_select(next_amount_data, 0,
                                              sample).view(
                                                  self.batch_size, -1)
        next_state_batch = torch.cat([next_state_batch, next_amount_data],
                                     dim=1)
        action_batch = torch.index_select(self.replay_action / self.unit, 0,
                                          sample)
        reward_batch = torch.index_select(self.replay_reward, 0, sample)

        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(next_state_batch)
        q_batch = self.critic_network.target_q(next_action_batch,
                                               next_state_batch)
        y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)

        # train actor-critic by target loss
        self.actor_network.train(
            self.critic_network.train(y_batch, action_batch, state_batch))

        # Update target networks by soft update
        self.actor_network.update_target()
        self.critic_network.update_target()

    def perceive(self, state, action, reward, next_state, done):
        if self.steps < self.start_size - 1:
            self.replay_state[self.steps] = state
            self.replay_next_state[self.steps] = next_state
            self.replay_action[self.steps] = action
            self.replay_reward[self.steps] = reward
        else:
            if self.steps >= self.memory_size:
                self.replay_state = self.replay_state[1:]
                self.replay_next_state = self.replay_next_state[1:]
                self.replay_action = self.replay_action[1:]
                self.replay_reward = self.replay_reward[1:]
            self.replay_state = torch.cat(
                (self.replay_state, state.unsqueeze(0)), dim=0)
            self.replay_next_state = torch.cat(
                (self.replay_next_state, next_state.unsqueeze(0)), dim=0)
            self.replay_action = torch.cat(
                (self.replay_action, action.unsqueeze(0)), dim=0)
            self.replay_reward = torch.cat(
                (self.replay_reward, reward.unsqueeze(0)), dim=0)
        self.steps += 1

    def act(self, next_state, portfolio):
        if self.steps > self.start_size:
            next_state_data = min_max_scale(
                self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1)
            next_amount_data = min_max_scale(
                self.replay_next_state[:, 2, :])[-1].view(1, -1)
            next_state_data = torch.cat([next_state_data, next_amount_data],
                                        dim=1)
            self.train_on_batch()
            allocation = self.actor_network.target_action(
                next_state_data).data.view(-1)
            allocation += torch.tensor(self.exploration_noise.noise().tolist(),
                                       device=cuda)
            allocation[allocation < 0] = 0
            allocation /= sum(allocation)
            allocation = torch.floor(portfolio * allocation /
                                     next_state[1, :] / self.unit) * self.unit
            self.action = allocation
        return self.action.clone()
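The class above relies on a `min_max_scale` helper that is not shown. A plausible sketch (an assumption; the original implementation may differ), normalizing each column of a 2-D tensor to [0, 1]:

import torch

def min_max_scale(x: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Column-wise min-max normalization to [0, 1]."""
    x_min = x.min(dim=0, keepdim=True).values
    x_max = x.max(dim=0, keepdim=True).values
    return (x - x_min) / (x_max - x_min + eps)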
Example #27
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: State Dimension
        :param action_size: Action dimension
        :param action_sigma: standard deviation of the noise to be added to the action
        :param memory_size:
        :param batch:
        :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper)
        :param noise_clip: How much noise to allow
        :param gamma:
        :param update_frequency:
        :param seed:
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            #if we want to inject some noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise

        return action

    def update(self, step):
        '''
        TD3 update (https://arxiv.org/pdf/1802.09477.pdf). The update is very
        similar to the standard DDPG update, except that:
        1) there are two critics to update,
        2) the target uses the minimum of the two target critics' outputs,
        3) clipped noise is added to the target action (Section 5.3 of the paper),
        4) the actor and target networks are updated only every update_frequency steps.

        :param step: current training step, used to delay the actor update
        :return:
        '''
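        # In symbols (a descriptive note, not part of the original source), the
        # target computed below is the clipped double-Q target with
        # target-policy smoothing:
        #   a' = pi_target(s') + clip(eps, -noise_clip, +noise_clip),  eps ~ N(0, sigma)
        #   y  = r + gamma * (1 - done) * min(Q0_target(s', a'), Q1_target(s', a'))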

        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_state_action = self.target_actor(next_state)

        # sample target-policy smoothing noise and clamp it
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss

        #as mentioned in the paper, we delay updating the actor network.

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
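A hedged usage sketch for the TD3 Agent above. It assumes an old-style Gym continuous-control environment (env.step returning four values) and that the Actor, Critic, ReplayBuffer classes and the learning-rate constants referenced by the class are already defined; the environment name and episode/batch numbers are illustrative only.

import gym
import numpy as np

env = gym.make('Pendulum-v1')
agent = Agent(env.observation_space.shape[0], env.action_space.shape[0])

for episode in range(200):                      # illustrative episode budget
    state, done, step = env.reset(), False, 0
    while not done:
        action = np.clip(agent.act(state),
                         env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        if len(agent.memory) > 128:             # assumes ReplayBuffer supports len()
            agent.update(step)                  # actor/targets refresh every update_frequency calls
        state, step = next_state, step + 1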
Exemple #28
class ActorCriticExperienceReplay(object):
    def __init__(self, env):
        self.MEMORY_SIZE = 200
        self.BATCH_SIZE = 10

        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())

        self.replay_memory = []

    def get_action(self, state, episode_percentage):
        # Sometimes pick random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        return -1 * (episode_percentage**2) + 1
        # return -1 * (episode_percentage - 1) ** 3

    def update(self, state, action, reward, new_state):
        td_error = self.critic.learn(
            state, reward,
            new_state)  # gradient = grad[r + gamma * V(s_) - V(s)]
        self.actor.learn(
            state, action,
            td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        # Add to replay memory
        self.replay_memory.append((state, action, reward, new_state))
        if len(self.replay_memory) >= self.MEMORY_SIZE:
            self.replay_memory.pop(0)

        # Learn from replayed memories
        if np.random.random() < 0.5 and len(
                self.replay_memory) > self.BATCH_SIZE:
            minibatch = random.sample(self.replay_memory, self.BATCH_SIZE)
            for (batch_state, batch_action, batch_reward,
                 batch_new_state) in minibatch:
                td_error = self.critic.learn(batch_state, batch_reward,
                                             batch_new_state)
                self.actor.learn(batch_state, batch_action, td_error)

    def get_name(self):
        return 'ActorCritic_ExperienceReplay'
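The quadratic schedule in get_exploration_prob() starts at 1.0 (always explore) and decays to 0.0 as episode_percentage approaches 1. A quick numerical check, not from the source:

for pct in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(pct, -1 * (pct ** 2) + 1)  # 1.0, 0.9375, 0.75, 0.4375, 0.0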
Exemple #29
class DDPGHedgingAgent:
    """DDPGAgent interacting with environment.
    
    Attribute:
        env (gym.Env): openAI Gym environment
        actor (nn.Module): target actor model to select actions
        actor_target (nn.Module): actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): initial random action steps
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporory storage for the recent transition
        total_step (int): total step numbers
        is_test (bool): flag to show the current mode (train / test)
    """
    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 10000,
                 name_cases='myproject'):
        """ Initialize. """

        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0
        # mode: train / test
        self.is_test = False
        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """
        Carries out several random steps through the environment to initially fill
        up the replay buffer with experiences

        Args:
            steps: number of random steps to populate the buffer with
        """

        if not self.is_test:
            print("Populate Replay Buffer... ")
            kbar = pkbar.Kbar(target=eps, width=20)
            state = self.env.reset()

            for i in range(eps):
                while True:
                    # Get action from sample space
                    selected_action = self.env.action_space.sample()
                    # selected_action = 0
                    noise = self.noise.sample()
                    selected_action = np.clip(selected_action + noise, -1.0,
                                              1.0)

                    next_state, reward, done, _ = self.env.step(
                        selected_action)
                    self.transition = [
                        state, selected_action, reward, next_state,
                        int(done)
                    ]
                    self.memory.append(Experience(*self.transition))

                    state = next_state
                    if done:
                        state = self.env.reset()
                        break

                kbar.add(1)

            # Fit the state scaler from the collected experiences; select_action()
            # and update_model() rely on it for normalisation.
            self.scaler = self.memory.standar_scaler()

    @torch.no_grad()
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        state_s = self.scaler.transform([state])
        selected_action = self.actor(
            torch.FloatTensor(state_s).to(self.device)).item()
        # add noise for exploration during training
        if not self.is_test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        self.transition = [state, selected_action]
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, int(done)]
            self.memory.append(Experience(*self.transition))

        return next_state, reward, done

    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Update the actor and critic by gradient descent.

        TODO: change the actor loss to a mean-variance objective
        (see the commented-out line in the actor step below).
        """
        device = self.device  # for shortening the following lines

        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size, self.device)

        state = torch.FloatTensor(self.scaler.transform(state)).to(device)
        next_state = torch.FloatTensor(
            self.scaler.transform(next_state)).to(device)
        # state = state.to(device)
        # next_state = next_state.to(device)
        action = action.to(device)
        reward = reward.to(device)
        done = done.to(device)

        masks = 1 - done
        next_action = self.actor_target(next_state)
        next_value = self.critic_target(next_state, next_action)
        curr_return = reward.reshape(
            -1, 1) + self.gamma * next_value * masks.reshape(-1, 1)

        # train critic
        values = self.critic(state, action)
        critic_loss = F.mse_loss(values, curr_return)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.critic.parameters():
            p.requires_grad = False

        # train actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for p in self.critic.parameters():
            p.requires_grad = True

        # target update
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # if episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0

                self._plot(
                    self.total_step,
                    scores,
                    actor_losses,
                    critic_losses,
                )

            # if training is ready
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        self.env.close()

        return score

    def _target_soft_update(self):
        """Soft-update: target = tau*local + (1-tau)*target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(
        self,
        frame_idx: int,
        scores: List[float],
        actor_losses: List[float],
        critic_losses: List[float],
    ):
        """Plot the training progresses."""

        self.wandb.log({
            'frame': frame_idx,
            'score': scores[-1],
            'actor_loss': actor_losses[-1],
            'critic_loss': critic_losses[-1]
        })
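A hedged usage sketch for DDPGHedgingAgent. The environment is a stand-in (the actual hedging environment is not part of this snippet), the hyper-parameters and project name are illustrative, and the Actor, Critic, ReplayBuffer, OUNoise classes and the wandb setup referenced by the class are assumed to be available.

import gym

env = gym.make('Pendulum-v1')                 # stand-in for the hedging environment
agent = DDPGHedgingAgent(env,
                         memory_size=100_000,
                         batch_size=64,
                         ou_noise_theta=0.15,  # common OU defaults, not from the source
                         ou_noise_sigma=0.2,
                         initial_random_episode=10,
                         name_cases='ddpg-hedging-demo')
agent.train(num_frames=5_000)                 # logs scores/losses to wandb via _plot()
print('test score:', agent.test())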
Exemple #30
class DDPGAGENT:
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        #--- actor -----#

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        #---- critic -----#

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        #self.timestep = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add_experience(state, action, reward, next_state, done)

        #self.timestep = (self.timestep + 1) % UPDATE_EVERY

        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                xp = self.memory.sample()
                self.learn(xp, GAMMA)  #GAMMA VALUE 0.99

    def act(self, state, noise_accumulate=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # add OU exploration noise scaled by the decaying epsilon
        if noise_accumulate:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_internal_state()

    def learn(self, xp, gamma):
        states, actions, rewards, next_states, dones = xp

        #--- critic update: TD target and MSE loss ---#

        actions_nxt = self.actor_target(next_states)

        q_target_next = self.critic_target(next_states, actions_nxt)

        q_target = rewards + (gamma * q_target_next * (1 - dones))

        q_expected = self.critic_local(states, actions)

        #MSE LOSS
        critic_loss = F.mse_loss(q_expected, q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clips gradient norm of an iterable of parameters
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        #--- actor update: maximise the local critic's Q-value estimate ---#
        actor_predicted = self.actor_local(states)
        actor_loss = -self.critic_local(states, actor_predicted).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        self.epsilon -= 1e-6
        self.noise.reset_internal_state()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
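The soft update above (and the soft_update / _target_soft_update methods in the earlier examples) is plain Polyak averaging: each call moves the target parameters a fraction tau of the way toward the local parameters. A tiny standalone check with hypothetical values:

import torch

tau = 0.01
target_param = torch.tensor([1.0])
local_param = torch.tensor([2.0])
target_param = tau * local_param + (1.0 - tau) * target_param
print(target_param)  # tensor([1.0100])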