def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        alpha (float): learning rate
        gamma (float): discount factor
        tau (float): interpolation factor for the target-network soft update
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.alpha = alpha
    self.gamma = gamma
    self.tau = tau

    # Q-Learning Network
    self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
    self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.alpha)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.state_size = state_size
    self.action_size = action_size

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, fc_units=FC_UNITS).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, fc_units=FC_UNITS).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
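# The constructors above call ReplayBuffer(action_size, buffer_size, batch_size, ...)
# without showing the class (the fourth argument is a seed in one snippet and a
# device in the other). A minimal sketch of what such a buffer might look like,
# assuming the seed-taking signature; the actual implementations may differ:
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # oldest experiences fall off the end
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform random minibatch, as consumed by the learn() methods below
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)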
def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
    self.x = x
    self.y = y
    self.size = size
    self.colour = (0, 0, 255)
    self.thickness = 0
    self.speed = 0
    self.angle = 0
    self.mass = mass
    self.drag = (self.mass / (self.mass + Constants.MASS_OF_AIR)) ** self.size

    ####################################
    self.state_size = state_size
    self.action_size = action_size

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size).to(Constants.DEVICE)
    self.qnetwork_target = QNetwork(state_size, action_size).to(Constants.DEVICE)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=Constants.LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE, Constants.BATCH_SIZE)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, max_action, minibatch_size,
             a_lr, c_lr, gamma, tau):
    self.state_size = state_size
    self.action_size = action_size
    self.max_action = max_action
    self.critic_lr = c_lr
    self.actor_lr = a_lr

    self.actor_network = Actor(self.state_size, self.action_size,
                               self.max_action, self.actor_lr)
    self.actor_target_network = Actor(self.state_size, self.action_size,
                                      self.max_action, self.actor_lr)
    self.critic_network = Critic(self.state_size, self.action_size, self.critic_lr)
    self.critic_target_network = Critic(self.state_size, self.action_size, self.critic_lr)

    self.actor_target_network.set_weights(self.actor_network.get_weights())
    self.critic_target_network.set_weights(self.critic_network.get_weights())

    self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
    self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

    self.replay_buffer = ReplayBuffer(1e6)
    self.MINIBATCH_SIZE = minibatch_size
    self.GAMMA = tf.cast(gamma, dtype=tf.float64)
    self.TAU = tau
    self.noise = OUNoise(self.action_size)
def __init__(self, state_size, action_size, batch_size, buffer_size, gamma, lr):
    """
    Initialize an Agent.

    @param state_size: (int) dimension of each state (= n)
    @param action_size: (int) dimension of each action; the argmax over action values is selected
    @param batch_size: (int) mini-batch size
    @param buffer_size: (int) replay-buffer size
    @param gamma: (float) discount factor
    @param lr: (float) learning rate
    """
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_goal_size = 2 * state_size  # state + goal = 2n
    self.action_size = action_size
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.gamma = gamma
    self.lr = lr

    # Q-Network (takes the concatenated state-goal vector as input)
    self.qnetwork_local = QNetwork(self.state_goal_size, action_size).to(self.device)
    self.qnetwork_target = QNetwork(self.state_goal_size, action_size).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size)
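# The doubled input size above implies a goal-conditioned Q-network (as in
# hindsight-experience-replay setups): the network consumes the state
# concatenated with the goal. A minimal sketch of how that input would be
# formed; the variable names are illustrative, not from the original code:
import numpy as np

n = 8
state = np.random.rand(n)                    # n-dimensional state
goal = np.random.rand(n)                     # n-dimensional goal
state_goal = np.concatenate([state, goal])   # shape (2n,), fed to qnetwork_local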
def __init__(self, env, config):
    self.C = config
    self.n_state = list(env.observation_space.shape)
    self.n_action = env.action_space.n
    self.epsilon = 1.
    self.lr = 1e-3
    self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
    self.net = Net(self.n_state, self.n_action, self.C)
class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    # Random action during practice
    def act_pre(self):
        a = np.random.randint(self.n_action)
        return a

    # Epsilon-greedy action selection
    def act(self, s):
        a = (self.greedy_act(s) if np.random.random() > self.epsilon
             else np.random.randint(self.n_action))
        return a

    def greedy_act(self, s):
        return self.net.action(s)

    # Practice without recording experiences
    def practice(self):
        self.lr = 1e-3  # possible
        self.net.pre_train(self.buffer, self.lr)

    # Records experiences and calls the training functions.
    # The variable `pre` differentiates practice from RL training.
    def record(self, s, a, r, d, it, pre):
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3
                    self.net.pre_train(self.buffer, self.lr)
        else:
            self.buffer.append(s, a, r, d)
            if it <= 5e5:
                self.epsilon = linear_interp(0, 5e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(5e5, 10e6, it, 0.01, 0.1), 0.01)
            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4  # learning rate for RL training
                    self.net.train(self.buffer, self.lr)
                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
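# `linear_interp` is called above but never defined in these snippets. One
# plausible implementation consistent with the call sites (epsilon anneals
# from 1.0 at step 0 down to 0.1 at step 5e5, then from 0.1 toward 0.01 by
# step 10e6); the original helper may differ:
def linear_interp(x0, x1, x, y_min, y_max):
    # Anneal linearly from y_max at x=x0 down to y_min at x=x1,
    # clamping outside the interval.
    frac = min(max((x - x0) / (x1 - x0), 0.0), 1.0)
    return y_max + frac * (y_min - y_max)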
def __init__(self, env, config):
    self.C = config
    self.n_state = list(env.observation_space.shape)
    self.n_action = env.action_space.n
    self.epsilon = 0.99
    self.lr = 1e-3  # learning rate
    self.buffer = ReplayBuffer(self.C['max_size'],
                               self.C['frame_stack'])   # memory for RL training
    self.buffer2 = ReplayBuffer(self.C['max_size'],
                                self.C['frame_stack'])  # memory for practice
    self.net = Net(self.n_state, self.n_action, self.C)
def __init__(self, input_size, output_size, training_mode, seed):
    self.seed = random.seed(seed)
    self.epsilon = self.EPSILON_MAX
    self.training_mode = training_mode
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.memory = ReplayBuffer(self.device, seed=seed)
    self.nn = NN(input_size, output_size, seed).to(self.device)
    if self.DOUBLE_DQN:
        self.target_nn = NN(input_size, output_size, seed).to(self.device)
    self.optimizer = optim.Adam(self.nn.parameters(), lr=self.ALPHA, amsgrad=False)
    self.loss_func = nn.MSELoss()
class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    def act_pre(self):
        a = np.random.randint(self.n_action)
        return a

    def act(self, s):
        a = (self.greedy_act(s) if np.random.random() > self.epsilon
             else np.random.randint(self.n_action))
        return a

    def greedy_act(self, s):
        return self.net.action(s)

    def record(self, s, a, r, d, it, pre):
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3  # possible
                    self.net.pre_train(self.buffer, self.lr)
        else:
            self.buffer.append(s, a, r, d)
            if it <= 6e5:
                self.epsilon = linear_interp(0, 6e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(6e5, 10e6, it, 0.01, 0.1), 0.01)
            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4
                    self.net.train(self.buffer, self.lr)
                    # print(Q)
                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
def __init__(self, alpha=0.2, input_dims=None, env=None, gamma=0.99,
             n_actions=None, max_size=10000000, batch_size=32,
             polyak=0.995, lr=1e-3):
    self.gamma = gamma
    self.alpha = alpha  # temperature parameter
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.polyak = polyak
    self.lossPi = []
    self.lossQ = []
    self.lossV = []
    self.lr = lr

    """Definition of the neural networks: 1 actor, 2 critics, 1 value and 1 target value."""
    # Two ways to estimate the value function:
    # 1) define and learn a dedicated neural network (the approach used here)
    # 2) estimate V(s_t) as the expected value of the Q-function minus the
    #    entropy term, Q(s_t, a_t) - log pi(a_t | s_t), with a_t drawn from
    #    the current policy pi
    self.actor = ActorNetwork(input_dims, n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(input_dims, n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(input_dims, n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(input_dims, name='value_net')
    self.target_value = ValueNetwork(input_dims, name="target_net")

    # Initialize the main V-network and the target V-network with the same parameters.
    for target_parameter, parameter in zip(self.target_value.parameters(),
                                           self.value.parameters()):
        target_parameter.data.copy_(parameter.data)

    # For simplicity, chain the parameters of both Q-networks into a single iterable
    CriticParameters = itertools.chain(self.critic_1.parameters(),
                                       self.critic_2.parameters())
    # and define one Adam optimizer over those parameters.
    self.optimizerCritic = optim.Adam(CriticParameters, lr=self.critic_1.lr)
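# A sketch of option (2) from the comment above: estimate the soft value
# V(s) from the twin critics and the policy's entropy term,
# V(s) ≈ E[min(Q1(s, a), Q2(s, a)) - alpha * log pi(a | s)], with a ~ pi(.|s).
# `sample_with_log_prob` is a hypothetical actor method returning
# (action, log_prob); the actual ActorNetwork may expose it differently.
import torch

def estimate_soft_value(self, states):
    actions, log_probs = self.actor.sample_with_log_prob(states)
    q1 = self.critic_1(states, actions)
    q2 = self.critic_2(states, actions)
    return torch.min(q1, q2) - self.alpha * log_probs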
def __init__(self, q_network, buffer_size, batch_size, update_every,
             gamma, tau, lr, seed):
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.update_every = update_every
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    self.qnetwork_local = copy.deepcopy(q_network)
    self.qnetwork_target = copy.deepcopy(q_network)
    self.seed = random.seed(seed)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
    self.t_step = 0
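# A note on the deepcopy pattern above: the agent clones the passed network
# twice, so the caller's template is left untouched while the local and
# target copies start from identical weights and then evolve independently.
# Hypothetical usage (the constructor arguments are illustrative only):
template = QNetwork(state_size=8, action_size=4, seed=0)
agent = Agent(q_network=template, buffer_size=int(1e5), batch_size=64,
              update_every=4, gamma=0.99, tau=1e-3, lr=5e-4, seed=0)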
def __init__(self):
    # self.moveNet = moveCNN()
    self.state = np.zeros(14)
    self.lstate = np.zeros(14)
    self.stateT = torch.FloatTensor()
    self.sub = rospy.Subscriber('/state', Floats, self.fetch)
    self.pub = rospy.Publisher('/cmd_vel_mux/input/navi', Twist, queue_size=10)
    self.fpass = 1
    self.move_cmd = Twist()
    self.move_cmd.linear.x = 0
    self.move_cmd.angular.z = 0
    self.max_episodes = 20000
    self.episode_length = 20  # maybe change later
    self.num_episodes = 0
    self.terminal = 0
    self.rBuf = ReplayBuffer()
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.001
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    # tune gamma between 0.95 and 0.99
    self.gamma = 0.99  # discount factor
    # tune tau between 0.001 and 0.01
    self.tau = 0.005  # for soft update of target parameters

    self.best_score = -np.inf
    self.score = 0
    self.step_count = 0
def __init__(self, board_size, gamma=0.9, buffer_size=3000, use_target_net=False):
    assert 0 <= gamma <= 1, "gamma should be in 0 to 1, got {}".format(gamma)
    self._board_size = board_size
    self._gamma = gamma
    self._buffer = ReplayBuffer(buffer_size)
    self._buffer_size = buffer_size
    self._input_shape = (self._board_size, self._board_size, 1)
    self._model = self.agent_model()
    self._use_target_net = use_target_net
    if use_target_net:
        self._target_net = self.agent_model()
        self.update_target_net()
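# `update_target_net` is called above but not shown. For Keras-style models
# like these, it is most plausibly a hard copy of the online weights into
# the target network; a sketch of such a method under that assumption:
def update_target_net(self):
    # Overwrite the target network's weights with the online model's weights.
    self._target_net.set_weights(self._model.get_weights())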
def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
             mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
             critic_l2_size, batch_size):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(mem_size, n_states, n_actions)
    self.batch_size = batch_size

    self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
    self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)
    self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
    self.target_critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)

    self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

    self.update_network_parameters(tau=1)
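# The update_network_parameters(tau=1) call above performs a hard copy, so
# the target networks start identical to the online networks; later calls
# with the stored (small) tau give the usual Polyak soft update. A sketch of
# what such a method might look like for PyTorch modules; the original may differ:
def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau
    for target, online in zip(self.target_actor.parameters(), self.actor.parameters()):
        target.data.copy_(tau * online.data + (1.0 - tau) * target.data)
    for target, online in zip(self.target_critic.parameters(), self.critic.parameters()):
        target.data.copy_(tau * online.data + (1.0 - tau) * target.data)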
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)
    self.device = Utils.getDevice()

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.steps = 0
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize the actor and critic networks,
    # along with both of their target networks
    self.state_dim = env.state_dim
    self.action_dim = env.action_dim

    self.sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.model_saver = tf.train.Saver()
def __init__(self, lr, inputChannels, stateShape, numActions, batchSize,
             epsilon=1.0, gamma=0.99, layer1Size=1024, layer2Size=512,
             maxMemSize=100000, epsMin=0.01, epsDecay=5e-4):
    self.lr = lr
    self.epsilon = epsilon
    self.epsMin = epsMin
    self.epsDecay = epsDecay
    self.gamma = gamma
    self.batchSize = batchSize
    self.actionSpace = list(range(numActions))
    self.maxMemSize = maxMemSize
    self.memory = ReplayBuffer(maxMemSize, stateShape)
    self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)
class DQAgent():
    def __init__(self, lr, inputChannels, stateShape, numActions, batchSize,
                 epsilon=1.0, gamma=0.99, layer1Size=1024, layer2Size=512,
                 maxMemSize=100000, epsMin=0.01, epsDecay=5e-4):
        self.lr = lr
        self.epsilon = epsilon
        self.epsMin = epsMin
        self.epsDecay = epsDecay
        self.gamma = gamma
        self.batchSize = batchSize
        self.actionSpace = list(range(numActions))
        self.maxMemSize = maxMemSize
        self.memory = ReplayBuffer(maxMemSize, stateShape)
        self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)

    ''' REENABLE EPSILON GREEDY '''
    def chooseAction(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor(observation).float().clone().detach()
            state = state.to(self.deepQNetwork.device)
            state = state.unsqueeze(0)
            policy = self.deepQNetwork(state)
            action = torch.argmax(policy).item()
            return action
        else:
            return np.random.choice(self.actionSpace)

    def storeMemory(self, state, action, reward, nextState, done):
        self.memory.storeMemory(state, action, reward, nextState, done)

    def learn(self):
        if self.memory.memCount < self.batchSize:
            return

        self.deepQNetwork.optimizer.zero_grad()

        stateBatch, actionBatch, rewardBatch, nextStateBatch, doneBatch = \
            self.memory.sample(self.batchSize)
        stateBatch = torch.tensor(stateBatch).to(self.deepQNetwork.device)
        actionBatch = torch.tensor(actionBatch).to(self.deepQNetwork.device)
        rewardBatch = torch.tensor(rewardBatch).to(self.deepQNetwork.device)
        nextStateBatch = torch.tensor(nextStateBatch).to(self.deepQNetwork.device)
        doneBatch = torch.tensor(doneBatch).to(self.deepQNetwork.device)

        batchIndex = np.arange(self.batchSize, dtype=np.int64)
        actionQs = self.deepQNetwork(stateBatch)[batchIndex, actionBatch]

        allNextActionQs = self.deepQNetwork(nextStateBatch)
        nextActionQs = torch.max(allNextActionQs, dim=1)[0]
        nextActionQs[doneBatch] = 0.0
        qTarget = rewardBatch + self.gamma * nextActionQs

        loss = self.deepQNetwork.loss(qTarget, actionQs).to(self.deepQNetwork.device)
        loss.backward()
        self.deepQNetwork.optimizer.step()

        if self.epsilon > self.epsMin:
            self.epsilon -= self.epsDecay
class Enemy():
    def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
        self.x = x
        self.y = y
        self.size = size
        self.colour = (0, 0, 255)
        self.thickness = 0
        self.speed = 0
        self.angle = 0
        self.mass = mass
        self.drag = (self.mass / (self.mass + Constants.MASS_OF_AIR)) ** self.size

        ####################################
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(Constants.DEVICE)
        self.qnetwork_target = QNetwork(state_size, action_size).to(Constants.DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=Constants.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE, Constants.BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    #######################################
    def display(self, screen):
        pygame.draw.circle(screen, self.colour, (int(self.x), int(self.y)),
                           self.size, self.thickness)

    def move(self):
        self.x += math.sin(self.angle) * self.speed
        self.y -= math.cos(self.angle) * self.speed
        self.speed *= self.drag

    def bounce(self, soccerfield):
        if self.x > Constants.SIZE_WIDTH - self.size:
            self.x = 2 * (Constants.SIZE_WIDTH - self.size) - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY
        elif self.x < self.size:
            self.x = 2 * self.size - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY

        if self.y > Constants.SIZE_HEIGHT - self.size:
            self.y = 2 * (Constants.SIZE_HEIGHT - self.size) - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY
        elif self.y < self.size:
            self.y = 2 * self.size - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY

        if self.x > int((19 * Constants.SIZE_WIDTH) / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 - self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
        elif self.x < int(Constants.SIZE_WIDTH / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 - self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY

        for i in range(4):
            dx = self.x - soccerfield.goalposts[i].x
            dy = self.y - soccerfield.goalposts[i].y
            dist = math.hypot(dx, dy)
            if dist < self.size + soccerfield.goalposts[i].size:
                angle = math.atan2(dy, dx) + 0.5 * math.pi
                total_mass = self.mass + 9999
                (self.angle, self.speed) = self.addVectors(
                    self.angle, self.speed * (self.mass - 9999) / total_mass,
                    angle, 0)
                self.speed *= Constants.ELASTICITY
                overlap = 0.5 * (self.size + soccerfield.goalposts[i].size - dist + 1)
                self.x += math.sin(angle) * overlap
                self.y -= math.cos(angle) * overlap
                break

    '''
    0 -> shoot
    1 -> up + left
    2 -> up + right
    3 -> down + left
    4 -> down + right
    5 -> up
    6 -> down
    7 -> left
    8 -> right
    '''
    def update(self, action, ball):
        if action == 0 and self.control_ball(ball):
            dx = -(self.x - ball.x) / 6
            dy = -(self.y - ball.y) / 6
            ball.angle = 0.5 * math.pi + math.atan2(dy, dx)
            ball.speed = math.hypot(dx, dy)
        if action == 1:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 2:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 3:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 4:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 5:
            dx = 0
            dy = -Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 6:
            dx = 0
            dy = Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 7:
            dx = -Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 8:
            dx = Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)

    def control_ball(self, ball):
        dx = self.x - ball.x
        dy = self.y - ball.y
        dist = math.hypot(dx, dy)
        return dist - 3 < self.size + ball.size

    def addVectors(self, angle1, length1, angle2, length2):
        x = math.sin(angle1) * length1 + math.sin(angle2) * length2
        y = math.cos(angle1) * length1 + math.cos(angle2) * length2
        angle = 0.5 * math.pi - math.atan2(y, x)
        length = math.hypot(x, y)
        return (angle, length)

    ###################
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % Constants.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > Constants.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, Constants.GAMMA)

    def act(self, state, eps=0.):
        # Returns actions for the given state as per the current policy.
        state = torch.from_numpy(state).float().unsqueeze(0).to(Constants.DEVICE)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, Constants.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
# Initialize policy
if args.policy == "TD3":
    # Target policy smoothing is scaled wrt the action scale
    kwargs["policy_noise"] = args.policy_noise
    kwargs["noise_clip"] = args.noise_clip
    kwargs["policy_freq"] = args.policy_freq
    policy = TD3.TD3(**kwargs)
elif args.policy == "DDPG":
    policy = DDPG.DDPG(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./checkpoint/{policy_file}")

replay_buffer = ReplayBuffer(state_dim, action_dim)

# Evaluate untrained policy
evaluations = []
# evaluations = [eval_policy(policy, env, args.seed, group_name)]
# state, done = env.reset(group_name), False

episode_reward = 0
episode_Rsim = 0
episode_Robs = 0
episode_Rcstr = 0
episode_timesteps = 0
episode_num = 0

for group_name in [group_name]:
    state, done = env.reset(group_name), False
ENV_NAME = 'BreakoutDeterministic-v4'

# Create environment
game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

# Create agent
MAIN_DQN = buildq_network(game_wrapper.env.action_space.n,
                          LEARNING_RATE, input_shape=INPUT_SHAPE)
TARGET_DQN = buildq_network(game_wrapper.env.action_space.n,
                            input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE)
agent = Agent(MAIN_DQN, TARGET_DQN, replay_buffer,
              game_wrapper.env.action_space.n, input_shape=INPUT_SHAPE)

print('Loading model...')
agent.load('save-01603987')
print('Loaded')

terminal = True
eval_rewards = []
evaluate_frame_number = 0

for frame in range(EVAL_LENGTH):
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Noise process
    self.mu = 0
    self.theta = 0.15
    self.sigmaStart = 0.5
    self.sigmaEnd = 0.1
    self.decayExponent = 0.01
    self.noise = OUNoise(self.action_size, self.mu, self.theta,
                         self.sigmaStart, self.sigmaEnd, self.decayExponent)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.0001  # for soft update of target parameters
    self.learningRateActor = 0.00005
    self.learningRateCritic = 0.0005
    self.dropoutActor = 0.1
    self.dropoutCritic = 0.1

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             learningRate=self.learningRateActor,
                             dropoutRate=self.dropoutActor)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              learningRate=self.learningRateActor,
                              dropoutRate=self.dropoutActor)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               learningRate=self.learningRateCritic,
                               dropoutRate=self.dropoutCritic, l2Lambda=1e-2)
    self.critic_target = Critic(self.state_size, self.action_size,
                                learningRate=self.learningRateCritic,
                                dropoutRate=self.dropoutCritic, l2Lambda=1e-2)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    self.rewardSum = 0
class AgentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd, self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic, l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic, l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        return list(action + noise), noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action
        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if train and self.replay_buffer.size() >= self.MINIBATCH_SIZE:
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch
        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])
        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime
        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = (predicted_qs - ys) * (predicted_qs - ys)
            loss = tf.reduce_mean(loss)
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)
        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch
        with tf.GradientTape() as tape:
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss, self.actor_network.trainable_weights)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch
        s = np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size)
        s = tf.convert_to_tensor(s)
        a = np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.action_size)
        a = tf.convert_to_tensor(a)
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size)
        s_1 = tf.convert_to_tensor(s_1)
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        minibatch = (s, a, r, s_1, t)
        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        self.actor_target_network.set_weights(
            np.array(self.actor_network.get_weights()) * self.TAU +
            np.array(self.actor_target_network.get_weights()) * (1 - self.TAU))
        self.critic_target_network.set_weights(
            np.array(self.critic_network.get_weights()) * self.TAU +
            np.array(self.critic_target_network.get_weights()) * (1 - self.TAU))
}

episodes = 301
lr = .001
gamma = .93
alpha = .001
epsilon = 1
tau = 2500
wait = 3000
batch_size = 32
maxLengthGame = 450

Qprincipal = QNetwork(lr)
Qtarget = QNetwork(lr)
Qtarget.model.set_weights(Qprincipal.model.get_weights())
rBuffer = ReplayBuffer(5000)
count = 0
results = []

print(' Episode | Score | Loss | Rounds')
for ep in range(episodes):
    loss = 0
    bots = [Robot() for i in range(3)]
    game = Game(player_names=[bot.name for bot in bots])
    epsilon = max(epsilon * .995, .1)
    winnings = 20000
    for i in range(maxLengthGame):
        maxLengthGame -= 1
        winnings -= 20
        count += 1
        if game.dieRoll == 7:
file_name = 'games/' + result + '_' + str(np.random.randint(500000))
with open(file_name, 'wb') as f:
    pkl.dump(game, f)
replay_buffer.save_game(game)
print('Thread: {}, Game: {}, Result {}, Reward {}'.format(
    threading.get_ident(), i, result, terminal_value))

if __name__ == '__main__':
    remote = False
    config = Config()
    replay_buffer = ReplayBuffer(config)
    network = Network(config, remote=remote)
    num_epochs = 1000000
    for e in range(num_epochs):
        # Make the network read-only so it can be run on multiple threads
        if not remote:
            network.graph.finalize()
        jobs = []
        for _ in range(config.num_actors):
            job = SelfPlay()
            job.start()
        self.lrScheduler.step()
        return loss.item()

    def evalMotionModel(self, dataBatch):
        self.MotionModel.eval()
        actualNextStates = dataBatch[1][0]
        predictedNextStates = self.MotionModel(dataBatch[0])
        loss = self.criterion(actualNextStates, predictedNextStates)
        return loss.item()

if __name__ == '__main__':
    # check if cuda is available
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    writer = SummaryWriter()

    # load replay buffer
    cpuReplayBuffer = ReplayBuffer(loadDataPrefix='simData/',
                                   saveDataPrefix='simData/', chooseCPU=True)
    cpuReplayBuffer.loadData(matchLoadSize=True)

    outputDataSTD = standardizeData()
    outputDataSTD.getDistribution(cpuReplayBuffer.outputData[0])
    cpuReplayBuffer.outputData[0] = outputDataSTD.whiten(cpuReplayBuffer.outputData[0])

    data = cpuReplayBuffer.getRandBatch()
    inStateDim = data[0][0].shape[1]
    inMapDim = data[0][1].shape[2]
    inActionDim = data[0][2].shape[1]
    outStateDim = data[1][0].shape[1]

    # training / neural-network parameters
    learningRate = 0.01
    lrDecay_stepSize = 3000
    lrDecay_gamma = 0.9
if __name__ == "__main__": replayBufferLength = 500000 numParallelSims = 16 sims = [] # set up simulations for i in range(numParallelSims): if i == -1: physicsClientId = p.connect(p.GUI) else: physicsClientId = p.connect(p.DIRECT) sims.append(simController(physicsClientId=physicsClientId)) data = sims[0].controlLoopStep([0, 0]) replayBuffer = ReplayBuffer(replayBufferLength, data[0], data[1], saveDataPrefix='simData/', chooseCPU=True) sTime = time.time() executor = concurrent.futures.ProcessPoolExecutor() while not replayBuffer.bufferFilled: results = executor.map(runSim, sims) for result in results: for data in result: replayBuffer.addData(data[0], data[1]) print("replay buffer index: " + str(replayBuffer.bufferIndex) + ", rtf: " + str(replayBuffer.bufferIndex * 0.25 / (time.time() - sTime))) print("estimated time left: " + str((replayBufferLength - replayBuffer.bufferIndex) /