def __init__(self, state_size, action_size, seed, device, params):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        device (torch.device): device to run the networks on
        params (dict): hyperparameters (LR, BUFFER_SIZE, BATCH_SIZE, ...)
    """
    self.state_size = state_size
    self.action_size = action_size
    random.seed(seed)
    self.seed = seed
    self.device = device
    self.params = params

    # Q-Network: local network is trained, target network provides bootstrap targets
    self.qnetwork_local = qn.QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = qn.QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=params['LR'])

    # Replay memory
    self.memory = rp.ReplayBuffer(action_size, params['BUFFER_SIZE'],
                                  params['BATCH_SIZE'], seed, device)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
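# A minimal sketch (not part of the original snippet) of how the fields above are
# typically used: a step() method that stores each transition and triggers learning
# every params['UPDATE_EVERY'] steps once enough samples are buffered. The
# memory.add()/sample(), learn() and GAMMA names are assumptions for illustration.
def step(self, state, action, reward, next_state, done):
    # Save the experience in replay memory
    self.memory.add(state, action, reward, next_state, done)

    # Learn every UPDATE_EVERY time steps, once a full batch is available
    self.t_step = (self.t_step + 1) % self.params['UPDATE_EVERY']
    if self.t_step == 0 and len(self.memory) > self.params['BATCH_SIZE']:
        experiences = self.memory.sample()
        self.learn(experiences, self.params['GAMMA'])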
def __init__(self):
    self.replay_buffer = replaybuffer.ReplayBuffer(5000)
    self.env = PendulumEnv()
    observation = self.env.reset()
    # Fall back to CPU when CUDA is unavailable
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Instantiate models
    state_size = 3
    action_size = 1
    self.state_dreamer = models.StateDreamer(state_size, action_size)
    self.reward_dreamer = models.RewardDreamer(state_size)
    self.actor = models.Actor(state_size, action_size)
    self.critic = models.Critic(state_size, action_size)

    # Put models on the device
    self.state_dreamer.to(self.device)
    self.reward_dreamer.to(self.device)
    self.actor.to(self.device)
    self.critic.to(self.device)

    # Create an optimiser for each model
    self.state_dreamer_optimizer = optim.SGD(
        self.state_dreamer.parameters(), lr=0.01, momentum=0.9)
    self.reward_dreamer_optimizer = optim.SGD(
        self.reward_dreamer.parameters(), lr=0.01, momentum=0.9)
    self.actor_optimizer = optim.SGD(
        self.actor.parameters(), lr=0.0001, momentum=0.9)
    self.critic_optimizer = optim.SGD(
        self.critic.parameters(), lr=0.001, momentum=0.9)
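# A usage sketch (assumed, not from the original code) of collecting one pendulum
# transition with the actor and storing it in the replay buffer. The buffer's push()
# signature and the actor's call convention are assumptions for illustration.
def collect_step(self, state):
    state_tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
    with torch.no_grad():
        action = self.actor(state_tensor).cpu().numpy()
    next_state, reward, done, _ = self.env.step(action)
    self.replay_buffer.push(state, action, reward, next_state, done)
    return next_state, done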
def __init__(self, agent, environment, max_size, episodic=True):
    self.agent = agent
    self.environment = environment
    self.max_size = max_size
    self.replay_buffer = rb.ReplayBuffer(max_size, agent.state_dim, agent.action_dim)
    self.episodic = episodic
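# A possible rollout loop for this collector (a sketch under assumptions, not the
# original implementation): the agent acts in the environment and every transition
# is written to the buffer. agent.act(), environment.reset()/step() and
# replay_buffer.append() are assumed interfaces for illustration.
def collect_episode(self):
    state = self.environment.reset()
    done = False
    while not done:
        action = self.agent.act(state)
        next_state, reward, done, _ = self.environment.step(action)
        self.replay_buffer.append(state, next_state, action, reward)
        state = next_state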
def __init__(self, state_dim, action_dim):
    self.replay_buffer = replaybuffer.ReplayBuffer()
    # Online network and a frozen copy that serves as the bootstrap target
    self.q_network = Q_Network(state_dim, action_dim, 16)
    self.q_network_target = copy.deepcopy(self.q_network)
    self.q_network_optim = torch.optim.Adam(
        self.q_network.parameters(), lr=0.001)
    self.criterion = nn.MSELoss()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = 0.995
    # Polyak averaging coefficient for soft target updates
    self.tau = 0.05
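# A sketch (assumed, but consistent with the tau field above) of the Polyak/soft
# target update typically paired with a deep-copied target network:
# theta_target <- tau * theta_online + (1 - tau) * theta_target.
def soft_update(self):
    for target_param, param in zip(self.q_network_target.parameters(),
                                   self.q_network.parameters()):
        target_param.data.copy_(
            self.tau * param.data + (1.0 - self.tau) * target_param.data)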
def __init__(self, state_dim, goal_dim, action_dim):
    self.replay_buffer = replaybuffer.ReplayBuffer()
    self.q_network = Goal_Based_Q_Network(state_dim, goal_dim, action_dim, 256)
    self.q_network_target = copy.deepcopy(self.q_network)
    self.q_network_optimizer = torch.optim.Adam(
        self.q_network.parameters(), lr=0.001, weight_decay=1e-3)
    self.criterion = nn.MSELoss()
    self.state_dim = state_dim
    self.goal_dim = goal_dim
    self.action_dim = action_dim
    self.discount = 0.99
    self.tau = 0.05
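# A sketch of greedy, goal-conditioned action selection (assumed, not from the
# original file). It assumes numpy is imported as np and that the network's forward
# pass accepts the state and goal concatenated into one vector, returning one
# Q-value per discrete action.
def act(self, state, goal):
    with torch.no_grad():
        x = torch.as_tensor(np.concatenate([state, goal]), dtype=torch.float32)
        q_values = self.q_network(x.unsqueeze(0))
    return int(q_values.argmax(dim=1).item())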
def test_append(self):
    max_buffer_size = 5
    buffer = replaybuffer.ReplayBuffer((2, 2), max_buffer_size)

    # inputs
    old_state = np.arange(4).reshape((2, 2))
    new_state = old_state * 10
    action = 0
    reward = 1

    buffer.append(old_state, new_state, action, reward)

    self.assertEqual(np.all(buffer.old_state[0] == old_state), True)
    self.assertEqual(np.all(buffer.old_state[1] == old_state), False)
    self.assertEqual(np.all(buffer.new_state[0] == old_state), False)
    self.assertEqual(buffer.old_state.shape, (max_buffer_size, 2, 2))
def test_shuffle(self):
    # NOTE: the shuffled order is random, so this assertion only holds about half
    # of the time; seeding NumPy would make the test deterministic.
    max_buffer_size = 3
    buffer = replaybuffer.ReplayBuffer((2, 2), max_buffer_size=max_buffer_size)
    old_state = np.arange(4).reshape((2, 2))
    new_state = old_state * 10
    action = 2
    reward = 1
    buffer.append(old_state, new_state, action, reward)
    buffer.append(old_state * 2, new_state * 2, action * 2, reward * 2)

    states_shuffled = np.array([old_state, old_state * 2])
    np.random.shuffle(states_shuffled)
    buffer.shuffle()
    old_state, _, _, _ = buffer.next_batch(1)
    self.assertEqual(np.all(states_shuffled[0] == old_state), True)
def test_next_batch(self):
    max_buffer_size = 3
    buffer = replaybuffer.ReplayBuffer((2, 2), max_buffer_size=max_buffer_size)

    # inputs
    old_state = np.arange(4).reshape((2, 2))
    new_state = old_state * 10
    action = 2
    reward = 1

    self.assertEqual(buffer.empty(), True)
    self.assertEqual(buffer.full(), False)

    # test indices
    self.assertEqual(buffer.write_idx, 0)
    buffer.append(old_state, new_state, action, reward)
    self.assertEqual(buffer.write_idx, 1)
    buffer.append(old_state * 2, new_state * 2, action * 2, reward * 2)

    # test empty flag
    self.assertEqual(buffer.empty(), False)
    self.assertEqual(buffer.full(), False)
    buffer.append(old_state * 3, new_state * 3, action * 3, reward * 3)
    self.assertEqual(buffer.empty(), False)
    self.assertEqual(buffer.full(), True)
    self.assertEqual(buffer.write_idx, 3)
    self.assertEqual(buffer.read_idx, 0)

    # test next_batch results (remainder size)
    old_state, new_state, action, reward = buffer.next_batch(2)
    self.assertEqual(buffer.read_idx, 2)
    self.assertEqual(old_state.shape, (2, 2, 2))
    self.assertEqual(new_state.shape, (2, 2, 2))
    self.assertEqual(np.all(action == [2, 4]), True)
    self.assertEqual(np.all(reward == [1, 2]), True)
    self.assertEqual(buffer.empty(), False)

    old_state, new_state, action, reward = buffer.next_batch(2)
    self.assertEqual(buffer.empty(), True)
    # test next_batch on empty
    return buffer
def test_init(self):
    # don't use prepare_buffer here
    buffer = replaybuffer.ReplayBuffer((2, 2), 2)
    self.assertEqual(buffer.old_state.shape, (2, 2, 2))
    self.assertEqual(buffer.new_state.shape, (2, 2, 2))
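# For reference, a minimal sketch of the ReplayBuffer interface the tests above
# exercise, reconstructed from the assertions (an illustration, not the actual
# implementation): preallocated old_state/new_state arrays, write_idx/read_idx
# cursors, and append/shuffle/next_batch/empty/full methods.
class ReplayBufferSketch:
    def __init__(self, state_shape, max_buffer_size):
        self.old_state = np.zeros((max_buffer_size,) + state_shape)
        self.new_state = np.zeros((max_buffer_size,) + state_shape)
        self.action = np.zeros(max_buffer_size)
        self.reward = np.zeros(max_buffer_size)
        self.write_idx = 0
        self.read_idx = 0

    def append(self, old_state, new_state, action, reward):
        self.old_state[self.write_idx] = old_state
        self.new_state[self.write_idx] = new_state
        self.action[self.write_idx] = action
        self.reward[self.write_idx] = reward
        self.write_idx += 1

    def shuffle(self):
        # Permute only the filled portion of the buffer
        perm = np.random.permutation(self.write_idx)
        self.old_state[:self.write_idx] = self.old_state[perm]
        self.new_state[:self.write_idx] = self.new_state[perm]
        self.action[:self.write_idx] = self.action[perm]
        self.reward[:self.write_idx] = self.reward[perm]

    def empty(self):
        return self.read_idx >= self.write_idx

    def full(self):
        return self.write_idx >= len(self.old_state)

    def next_batch(self, batch_size):
        # Returns at most batch_size items; the last batch may be smaller
        end = min(self.read_idx + batch_size, self.write_idx)
        batch = slice(self.read_idx, end)
        self.read_idx = end
        return (self.old_state[batch], self.new_state[batch],
                self.action[batch], self.reward[batch])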
import math

import torch
import torch.autograd as autograd
import torch.optim as optim

from IPython.display import clear_output
import matplotlib.pyplot as plt

import cnndqn
import replaybuffer

USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: (autograd.Variable(*args, **kwargs).cuda()
                                    if USE_CUDA else autograd.Variable(*args, **kwargs))

model = cnndqn.CnnDQN((8, 8), 8 * 8)
optimizer = optim.Adam(model.parameters(), lr=0.00001)

replay_initial = 300
replay_buffer = replaybuffer.ReplayBuffer(10000)

# Exponentially decaying epsilon schedule for epsilon-greedy exploration
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: (
    epsilon_final + (epsilon_start - epsilon_final)
    * math.exp(-1. * frame_idx / epsilon_decay))

n = 8  # board size (even)
board = [['0' for x in range(n)] for y in range(n)]

# 8 directions
dirx = [-1, 0, 1, -1, 1, -1, 0, 1]
diry = [-1, -1, -1, 0, 0, 1, 1, 1]
opt = 2
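# A brief usage sketch (assumed, not from the original script) of how the schedule
# above is consumed in a training loop: compute epsilon for the current frame and
# act epsilon-greedily on the 8x8 board. model.act() is an assumed helper.
# For reference, epsilon_by_frame(0) ~= 1.0, epsilon_by_frame(30000) ~= 0.37,
# epsilon_by_frame(90000) ~= 0.06.
import random

def select_action(state, frame_idx):
    epsilon = epsilon_by_frame(frame_idx)
    if random.random() > epsilon:
        return model.act(state)        # greedy action from the CnnDQN
    return random.randrange(8 * 8)     # random cell on the 8x8 board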