Example no. 1
0
class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action

        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(
            self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if (train and self.replay_buffer.size() >= self.MINIBATCH_SIZE):
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch

        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])
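        # Bellman target below: y = r + γ * (1 - done) * Q'(s', μ'(s'))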

        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime

        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = (predicted_qs - ys) * (predicted_qs - ys)
            loss = tf.reduce_mean(loss)
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)

        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch

        with tf.GradientTape() as tape:
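            # Deterministic policy gradient: minimize -mean(Q(s, μ(s))) so the actor is pushed toward higher-valued actions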
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(
                self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_network.trainable_weights)

        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch
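        # Reshape each component to a (batch, dim) float64 array/tensor before the tf.function train steps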

        s = np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.state_size)
        s = tf.convert_to_tensor(s)
        a = np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.action_size)
        a = tf.convert_to_tensor(a)
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                      self.state_size)
        s_1 = tf.convert_to_tensor(s_1)
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)

        minibatch = (s, a, r, s_1, t)

        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        # Polyak-average each layer: θ_target ← τ·θ + (1 - τ)·θ_target
        self.actor_target_network.set_weights([
            self.TAU * w + (1 - self.TAU) * w_t
            for w, w_t in zip(self.actor_network.get_weights(),
                              self.actor_target_network.get_weights())
        ])
        self.critic_target_network.set_weights([
            self.TAU * w + (1 - self.TAU) * w_t
            for w, w_t in zip(self.critic_network.get_weights(),
                              self.critic_target_network.get_weights())
        ])
Example no. 2
0
            # Select action randomly or according to policy
            if t < args.start_timesteps:
                action = env.sample(group_name)
                # print(f"Sampled action: {action}")
            else:
                action = (policy.select_action(np.array(state)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

            # Perform action
            next_state, rewards, done = env.step(action, group_name)
            reward, Rsim, Robs, Rcstr = unpack_rewards(rewards)

            # Store data in replay buffer
            replay_buffer.add(state, action, next_state, reward, float(done))
            state = next_state
            episode_reward += reward
            episode_Rsim += Rsim
            episode_Robs += Robs
            episode_Rcstr += Rcstr

            # Train agent after collecting sufficient data
            if t >= args.start_timesteps:
                policy.train(replay_buffer, args.batch_size)

            if done:
                # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
                print(f"Total T: {t+1} Episode Num: {episode_num+1} \
                        Episode T: {episode_timesteps} Reward: {episode_reward:.3f} \
                        Rsim: {episode_Rsim:.3f} Robs: {episode_Robs:.3f} Rcstr: {episode_Rcstr:.3f} \
Example no. 3
0
class Enemy():
    def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
        self.x = x
        self.y = y
        self.size = size
        self.colour = (0, 0, 255)
        self.thickness = 0
        self.speed = 0
        self.angle = 0
        self.mass = mass
        self.drag = (self.mass /
                     (self.mass + Constants.MASS_OF_AIR))**self.size
        ####################################
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size).to(Constants.DEVICE)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size).to(Constants.DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=Constants.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE,
                                   Constants.BATCH_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        #######################################

    def display(self, screen):
        pygame.draw.circle(screen, self.colour, (int(self.x), int(self.y)),
                           self.size, self.thickness)

    def move(self):
        self.x += math.sin(self.angle) * self.speed
        self.y -= math.cos(self.angle) * self.speed
        self.speed *= self.drag

    def bounce(self, soccerfield):
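        # Reflect off the outer walls, then off the goal-mouth edges near each end of the pitch, and finally off the four goalposts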
        if self.x > Constants.SIZE_WIDTH - self.size:
            self.x = 2 * (Constants.SIZE_WIDTH - self.size) - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY
        elif self.x < self.size:
            self.x = 2 * self.size - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY

        if self.y > Constants.SIZE_HEIGHT - self.size:
            self.y = 2 * (Constants.SIZE_HEIGHT - self.size) - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY
        elif self.y < self.size:
            self.y = 2 * self.size - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY

        if self.x > int((19 * Constants.SIZE_WIDTH) / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 -
                              self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
        elif self.x < int(Constants.SIZE_WIDTH / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 -
                              self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY

        for i in range(4):
            dx = self.x - soccerfield.goalposts[i].x
            dy = self.y - soccerfield.goalposts[i].y
            dist = math.hypot(dx, dy)
            if dist < self.size + soccerfield.goalposts[i].size:
                angle = math.atan2(dy, dx) + 0.5 * math.pi
                total_mass = self.mass + 9999
                (self.angle, self.speed) = self.addVectors(
                    self.angle,
                    self.speed * (self.mass - 9999) / total_mass, angle, 0)
                self.speed *= Constants.ELASTICITY
                overlap = 0.5 * (self.size + soccerfield.goalposts[i].size -
                                 dist + 1)
                self.x += math.sin(angle) * overlap
                self.y -= math.cos(angle) * overlap
                break

    '''
        0 -> shoot
        1 -> up + left
        2 -> up + right
        3 -> down + left
        4 -> down + right
        5 -> up 
        6 -> down
        7 -> left
        8 -> right
     '''

    def update(self, action, ball):

        if action == 0 and self.control_ball(ball):
            dx = -(self.x - ball.x) / 6
            dy = -(self.y - ball.y) / 6
            ball.angle = 0.5 * math.pi + math.atan2(dy, dx)
            ball.speed = math.hypot(dx, dy)
        if action == 1:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 2:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 3:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 4:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 5:
            dx = 0
            dy = -Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 6:
            dx = 0
            dy = Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 7:
            dx = -Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 8:
            dx = Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)

    def control_ball(self, ball):
        dx = self.x - ball.x
        dy = self.y - ball.y
        dist = math.hypot(dx, dy)
        if dist - 3 < self.size + ball.size:
            return True
        return False

    def addVectors(self, angle1, length1, angle2, length2):
        x = math.sin(angle1) * length1 + math.sin(angle2) * length2
        y = math.cos(angle1) * length1 + math.cos(angle2) * length2

        angle = 0.5 * math.pi - math.atan2(y, x)
        length = math.hypot(x, y)

        return (angle, length)

    ###################
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % Constants.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > Constants.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, Constants.GAMMA)

    def act(self, state, eps=0.):
        # Returns actions for given state as per current policy.

        state = torch.from_numpy(state).float().unsqueeze(0).to(
            Constants.DEVICE)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #

        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         Constants.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example no. 4
0
class AgentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd,
                             self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic,
                                   l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic,
                                    l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        return list(action + noise), noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = [
            self.tau * lw + (1 - self.tau) * tw
            for lw, tw in zip(local_weights, target_weights)
        ]
        target_model.set_weights(new_weights)
Example no. 5
0
    def train(self, sess, actor, critic, actor_noise, buffer_size,
              minibatch_size):

        # Set up summary Ops
        summary_ops, summary_vars = self.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter("./results", sess.graph)

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(int(buffer_size), int(1234))

        for i in range(self.max_episodes):

            # s = env.reset()
            self.pub2.publish()
            # print "reset called"
            ep_reward = 0
            ep_ave_max_q = 0

            for j in range(self.episode_length):
                if j == 0:
                    # print "first round"
                    s, R = self.getstate([0, 0])
                    R = 0
                    self.lstate = s
                    print(R)
                    continue

                # Added exploration noise
                #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))

                # Wait for the state here on the first pass / initialization

                # Interact with the environment here:
                # perform the action and wait for the new state
                # a=np.array([0.1,0])
                a = actor.predict(np.reshape(
                    s, (1, actor.s_dim))) + actor_noise()
                # print a[0]
                s2, R = self.getstate(a[0])
                print(R)

                replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                                  np.reshape(a, (actor.a_dim, )),
                                  R, self.terminal,
                                  np.reshape(s2, (actor.s_dim, )))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > int(minibatch_size):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                     replay_buffer.sample_batch(int(minibatch_size))

                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = []
                    for k in range(int(minibatch_size)):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + critic.gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(y_i, (int(minibatch_size), 1)))

                    ep_ave_max_q += np.amax(predicted_q_value)

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                # s = s2
                s = s2
                self.lstate = s
                ep_reward += R

                if self.terminal == 1:
                    self.terminal = 0
                    # print "terminal!!!!!!!!!"
                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]:
                                               ep_reward,
                                               summary_vars[1]:
                                               ep_ave_max_q / float(j)
                                           })

                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                            i, (ep_ave_max_q / float(j))))
                    break
Example no. 6
0
class stateMsg():

	def __init__(self):

		# self.moveNet = moveCNN()
		self.state = np.zeros(14);
		self.lstate = np.zeros(14);
		self.stateT=torch.FloatTensor()
		self.sub = rospy.Subscriber('/state', Floats, self.fetch)
		self.pub = rospy.Publisher('/cmd_vel_mux/input/navi',Twist,queue_size=10)
		self.fpass=1
		self.move_cmd = Twist()
		self.move_cmd.linear.x = 0
		self.move_cmd.angular.z = 0
		self.max_episodes=20000
		self.episode_length=20 #maybe change later
		self.num_episodes=0
		self.terminal=0
		self.rBuf=ReplayBuffer()

	def train(self,states):
		states=torch.unsqueeze(states, 0)
		X = Variable(states.clone().cpu())
		actions=actor.forward(X)
		Q=critic.forward(X,actions)
		return Q, actions


	def fetch(self,msg):
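		# ROS callback: build the state from the /state message and the last velocity command, query the actor, publish the Twist, and store the transition in the replay buffer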
		if self.num_episodes<self.max_episodes:
			self.state=np.array(msg.data)
			self.state=np.concatenate((self.state,np.array([self.move_cmd.linear.x,self.move_cmd.angular.z])))
			self.stateT=torch.from_numpy(self.state).type(dtype)
			# Q,actions=self.train(self.stateT)
			states=torch.unsqueeze(self.stateT, 0)
			X = Variable(states.clone().cpu())
			print(X)
			actions=actor.forward(X)
			action=actions.data.numpy()
			self.move_cmd.linear.x = action[0][0]
			self.move_cmd.angular.z = action[0][1]
			self.pub.publish(self.move_cmd)
			if self.fpass==0:
				R=self.reward()
				self.rBuf.add(self.lstate,action,R,self.terminal,self.state)
				if self.rBuf.size()>5:
					s_batch, a_batch, r_batch, t_batch, s2_batch = self.rBuf.sample_batch(5)
					
				# Q=critic.forward(X,actions)
			if self.fpass==1:
				self.fpass=0	
			self.lstate=self.state

	def reward(self):
		dist=self.state[10]
		ldist=self.lstate[10]
		# print dist
		if dist<0.2:
			R=10
			self.terminal=1
			self.num_episodes+=1
		elif dist==1234:
			R=-100
			self.terminal=1
			self.num_episodes+=1
			# print "hit"
		else:
			R=0.1*(ldist-dist)

		return R
Example no. 7
0
class DDPGController(object):
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

            # if self.time_step % 10000 == 0:
            # self.actor_network.save_network(self.time_step)
            # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch,
                                       [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch,
                                                action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)

        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess,
                              path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'model.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded from " + path)
Example no. 8
0
class Agent:
    """ Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, batch_size, buffer_size, gamma,
                 lr):
        """ Initialize an Agent.

        @param state_size: (int) dimension of each state (= n)
        @param action_size: (int) dimension of each action (= n), select maximum as action
        @param batch_size: (int) mini-batch size
        @param buffer_size: replay-buffer size
        @param gamma: discount factor
        @param lr: learning rate
        """
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.state_goal_size = 2 * state_size  # state+goal = 2n
        self.action_size = action_size

        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.lr = lr

        # Q-Network
        self.qnetwork_local = QNetwork(self.state_goal_size,
                                       action_size).to(self.device)
        self.qnetwork_target = QNetwork(self.state_goal_size,
                                        action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size)

    def store_episode(self, states, actions, rewards, next_states, dones):
        """ Store episode to replay buffer for standard experience replay.

        @param states: (list of dicts) containing 'obs' and 'goal' (is stored as s||g in memory)
        @param actions: list of actions in episode
        @param rewards: list of rewards received in episode
        @param next_states: list of next states (is stored as ns||g in memory)
        @param dones: boolean indicating end of episode
        """
        # normal experience replay, store experiences
        state_goals = [np.concatenate([i['obs'], i['goal']]) for i in states]
        next_state_goals = [
            np.concatenate([i['obs'], i['goal']]) for i in next_states
        ]
        for (sg, a, r, nsg, d) in zip(state_goals, actions, rewards,
                                      next_state_goals, dones):
            self.memory.add(sg, a, r, nsg, d)

    def store_episode_HER(self,
                          states,
                          actions,
                          next_states,
                          replay_strategy='final',
                          k=4):
        """ Store episode with HER samples if replay_strategy is set to 'final', 'future' or 'episode'.

        @param states: (list of dicts) containing 'obs' and 'goal' (is stored as s||g in memory)
        @param actions: list of actions
        @param next_states: list of next states (is stored as ns||g in memory)
        @param replay_strategy: if 'future' HER samples are added to the buffer
        @param k: number of goals in one episode for HER
        """
        T = len(actions)
        n_bits = len(states[0]['obs'])
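        # For every replay strategy below, the goal is substituted into the transition and reward/done are recomputed against it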

        if replay_strategy == 'final':
            # HER 'final' replay strategy ---------------------------------------------------------
            # substitute goal as final state of episode
            goal_her = next_states[-1]['obs']

            for t in range(T):
                state_goal = np.concatenate((states[t]['obs'], goal_her))
                next_state_goal = np.concatenate(
                    (next_states[t]['obs'], goal_her))
                # recompute reward and done
                done = np.sum(
                    np.array(next_states[t]['obs']) == np.array(
                        goal_her)) == n_bits
                reward = 0 if done else -1
                self.memory.add(state_goal, actions[t], reward,
                                next_state_goal, done)

        if replay_strategy == 'future':
            # HER 'future' replay strategy ---------------------------------------------------------
            for t in range(T):
                for _ in range(k):
                    future_idx = np.random.randint(
                        t, T
                    )  # select random index from future experience in episode
                    # set goal as next_state from future index
                    goal_her = next_states[future_idx]['obs']
                    state_goal = np.concatenate([states[t]['obs'], goal_her])
                    next_state_goal = np.concatenate(
                        [next_states[t]['obs'], goal_her])
                    # recompute reward and done
                    done = np.sum(
                        np.array(next_states[t]['obs']) == np.array(
                            goal_her)) == n_bits
                    reward = 0 if done else -1
                    self.memory.add(state_goal, actions[t], reward,
                                    next_state_goal, done)

        if replay_strategy == 'episode':
            # HER 'episode' replay strategy ---------------------------------------------------------
            for t in range(T):
                for _ in range(k):
                    episode_idx = np.random.randint(
                        0, T)  # select random index from current episode
                    # set goal as random (next) state in episode
                    goal_her = next_states[episode_idx]['obs']
                    state_goal = np.concatenate([states[t]['obs'], goal_her])
                    next_state_goal = np.concatenate(
                        [next_states[t]['obs'], goal_her])
                    # recompute reward and done
                    done = np.sum(
                        np.array(next_states[t]['obs']) == np.array(
                            goal_her)) == n_bits
                    reward = 0 if done else -1
                    self.memory.add(state_goal, actions[t], reward,
                                    next_state_goal, done)

    def act(self, state_goal, eps=0.):
        """ Returns actions for given state as per current policy

        @param state_goal: (array_like) current state
        @param eps: (float) epsilon, for epsilon-greedy action selection
        @return: (int) action is the index of the bit to flip, value in [0, n-1]
        """
        state_goal = torch.from_numpy(state_goal).float().unsqueeze(0).to(
            self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_goal)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """ Update value parameters using given batch of experience tuples."""
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()

            # compute and minimize the loss

            state_goals, actions, rewards, next_state_goals, dones = experiences

            # update rule
            Q_targets = rewards + \
                        self.gamma * self.qnetwork_target(next_state_goals).max(1)[0].unsqueeze(1) * (1 - dones)

            Q_expected = self.qnetwork_local(state_goals).gather(1, actions)

            # MSE loss
            loss = F.mse_loss(Q_expected, Q_targets)

            # optimization
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters:
        θ_target = τ*θ_local + (1 - τ)*θ_target

        @param local_model: local pytorch model
        @param target_model: target pytorch model
        @param tau: soft update of target network, 1-tau = polyak coefficient
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example no. 9
0
def train(sess, env, args, actor, critic, actor_noise):
    # Load ckpt file
    if args['load_ckpts']:
        print("Loading checkpoints")
        loader = tf.compat.v1.train.Saver()
        if args['ckpts_file'] is not None:
            ckpt = args['ckpts_dir'] + '/' + args['ckpts_file']
        else:
            ckpt = tf.train.latest_checkpoint(args['ckpts_dir'])
        loader.restore(sess, ckpt)
        sys.stdout.write('%s restored.\n\n' % ckpt)
        sys.stdout.flush()
        ckpt_split = ckpt.split('-')
        train_ep = ckpt_split[-1]
    else:
        print("Starting new training")
        sess.run(tf.compat.v1.global_variables_initializer())
        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        train_ep = 0

    # Define saver for saving model ckpts
    model_name = str(env) + '.ckpt'
    checkpoint_path = os.path.join(args['ckpts_dir'], model_name)
    if not os.path.exists(args['ckpts_dir']):
        os.makedirs(args['ckpts_dir'])
    saver = tf.compat.v1.train.Saver()

    # Setup Summary
    summary_ops, summary_vars = build_summaries()

    # sess.run(tf.compat.v1.global_variables_initializer())

    # Initialize target network weights
    # actor.update_target_network()
    # critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    for i in range(int(train_ep) + 1, int(args['max_episodes']) + 1):

        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, _ = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                # Find argmax q value of the current episode
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            csv_write = [i, ep_reward, ep_ave_max_q]

            if terminal:
                if summary_ops is not None:
                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]:
                                               ep_reward,
                                               summary_vars[1]:
                                               ep_ave_max_q / float(j)
                                           })

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                        i, (ep_ave_max_q / float(j))))
                break

        if (i % int(args['ckpts_step']) == 0):
            saver.save(sess, checkpoint_path, i)
            sys.stdout.write('Checkpoint saved \n')
            sys.stdout.flush()

        with open('result/rewards.csv', mode='a', newline='') as output_file:
            output_writer = csv.writer(output_file, lineterminator='\n')
            output_writer.writerow(csv_write)
Example no. 10
0
class Agent_DQN:
    def __init__(self,
                 action_size,
                 state_size,
                 learning_rate=0.01,
                 discount_factor=0.9,
                 epsilon_initial=1,
                 epsilon_decay=0.995,
                 batch_size=32):

        # Set the various member variables in the constructor
        self.action_size = action_size
        self.state_size = state_size

        # Define the learning rate and the global step.
        self.global_step = tf.Variable(0, trainable=False)

        # decayed_learning_rate = learning_rate *
        #                         decay_rate ^ (global_step / decay_steps)
        self.learning_rate = tf.train.exponential_decay(learning_rate,
                                                        self.global_step,
                                                        100,
                                                        0.9999,
                                                        staircase=False,
                                                        name='learning_rate')

        # Set the discount factor as well.
        self.gamma = discount_factor

        # Exploration is epsilon-greedy, so set epsilon and its decay rate.
        self.epsilon = epsilon_initial
        self.epsilon_decay = epsilon_decay

        # Set the batch size
        self.batch_size = batch_size
        self.learning_iteration = 0

        # Define the replay memory. It has to store (s, a, r, s_), so allocate space for s, s_, a and r.
        self.memory_size = 2000
        self.replayBuffer = ReplayBuffer(self.memory_size)

        # Build two networks; one of them is used as the fixed Q-target.
        self.build_evaluation_network()
        self.build_target_network()

        # Collect the parameters of the target net and the eval net; they can be gathered via their top-level variable scopes.
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='tn')
        self.t_params = t_params
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='en')

        # With tf.assign, one tensor variable's value can be written into another; here the target net is softly moved toward the eval net.
        self.replace_target_op = [
            tf.assign(t, (1 - 0.03) * t + 0.03 * e)
            for t, e in zip(t_params, e_params)
        ]

        # Create the session.
        self.sess = tf.Session()

        # Run the initializer
        self.sess.run(tf.global_variables_initializer())
        self.loss_history = []
        self.learning_rate_history = []

    def build_evaluation_network(self):
        '''
        Unlike the target net, the eval net additionally needs the ops that compute the loss.
        The target net only serves as the fixed Q-target and is never updated, so only this eval net must be built with trainable=True.
        :return:
        '''
        # Placeholder for the data fed into the evaluation net.
        self.eval_input = tf.placeholder(tf.float32, [None, self.state_size],
                                         name='eval_input')

        # self.y and self.a are placeholders used to compute the loss.
        self.y = tf.placeholder(tf.float32, [None], name='Q_target')
        self.a = tf.placeholder(tf.int64, [None], name='action')

        # The actual network
        with tf.variable_scope('en'):
            hidden1 = tf.layers.dense(
                self.eval_input,
                10,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1',
                trainable=True)
            self.q_eval = tf.layers.dense(
                hidden1,
                self.action_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2',
                trainable=True)

        # Loss computation
        with tf.variable_scope('loss'):
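            # Q(s, a) for the taken action is selected with a one-hot mask; the loss is the mean squared TD error against self.y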
            self.a_one_hot = tf.one_hot(self.a, depth=self.action_size)
            self.q_predict = tf.reduce_sum(tf.multiply(self.q_eval,
                                                       self.a_one_hot),
                                           axis=1)
            self.loss = tf.reduce_mean(
                tf.squared_difference(self.y, self.q_predict))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.learning_rate)\
                .minimize(self.loss, global_step=self.global_step)

    def build_target_network(self):
        self.target_input = tf.placeholder(tf.float32, [None, self.state_size],
                                           name='target_input')
        with tf.variable_scope('tn'):
            hidden1 = tf.layers.dense(
                self.target_input,
                10,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer1',
                trainable=False)
            self.get_q_target = tf.layers.dense(
                hidden1,
                self.action_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., 0.5),
                bias_initializer=tf.random_normal_initializer(0., 0.1),
                name='layer2',
                trainable=False)

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def get_action(self, observation):
        '''
        x : cart position
        dx/dt : cart velocity
        θ : pole angle
        dθ/dt : angular velocity
        Depending on the epsilon value, this function returns either the neural network's action or a random action.
        '''
        if np.random.uniform() > self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.eval_input: [observation]})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.action_size)
        return action

    def learn(self):
        '''
        Function in which the neural network update takes place.
        '''
        # Learn only once the memory has been sufficiently filled; otherwise skip learning.
        if self.learning_iteration >= self.memory_size:
            # Blend the eval net parameters into the fixed Q-target at the chosen rate.
            self.sess.run(self.replace_target_op)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([x[2] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            # q_eval is used to get the current Q values; get_q_target gives the Q values inside the max operator.
            get_q_target, q_eval = self.sess.run(
                [self.get_q_target, self.q_eval],
                feed_dict={
                    self.target_input: batch_s_,  # fixed params
                    self.eval_input: batch_s,  # newest params
                })

            # The action is stored right after the state in the batch memory, so just read it out.
            a = batch_a
            # The reward was stored right after the action, so take the following value.
            reward = batch_r
            # Build the value fed into the self.y placeholder from the quantities computed above.
            _, self.loss_out = self.sess.run(
                [self._train_op, self.loss],
                feed_dict={
                    self.eval_input: batch_s,
                    self.y: reward + self.gamma * np.max(get_q_target, axis=1),
                    self.a: a
                })
            self.loss_history.append(self.loss_out)

            # Decay epsilon periodically for epsilon-greedy exploration.
            self.epsilon = self.epsilon * self.epsilon_decay

        # Count the iterations, and append the current learning rate to its history for plotting.
        self.learning_iteration += 1
        self.learning_rate_history.append(self.sess.run([self.learning_rate]))

    def plot_loss(self):
        # Plots can even be drawn with the Times New Roman font in Python!
        plt.title('History')
        ms = 0.1
        me = 1
        line_width = 0.5
        plt.ylabel('Loss')
        plt.xlabel('Training steps')
        plt.plot(np.arange(len(self.loss_history)),
                 self.loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)
        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 2)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.grid()
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.show()
Example no. 11
0
class Agent():
    def __init__(self, q_network, buffer_size, batch_size, update_every, gamma, tau, lr,  seed):

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.qnetwork_local = copy.deepcopy(q_network)
        self.qnetwork_target = copy.deepcopy(q_network)
        self.seed = seed
        random.seed(seed)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
        
        self.temperature = 1
        self.t_step = 0
        ########################
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = self.qnetwork_target.to(device)

    def get_Q(self, state):
        return self.qnetwork_local.Q(state)
    
    def reset_memory(self):
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed)

    def predict_option_termination(self, state, current_option):
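        # Sample whether the current option terminates (Bernoulli on its termination probability) and pick the greedy next option from Q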
        state = torch.tensor(state).float().to(device)
        state = self.qnetwork_local.state(state)
        termination = self.qnetwork_local.terminations(state).softmax(dim = -1)
        termination = termination[current_option]
        #termination = self.qnetwork_local.terminations(state)[current_option].sigmoid()
        option_termination = Bernoulli(termination).sample()
        Q = self.get_Q(state)
        next_option = Q.argmax(dim=-1)
        return bool(option_termination.item()), next_option.item()
    
    def get_terminations(self, state):
        return self.qnetwork_local.terminations(state).softmax(dim = -1)

    def greedy_option(self, state):
        state = to_tensor(state).to(device) 
        state = self.qnetwork_local.state(state)
        Q = self.get_Q(state)
        return Q.argmax(dim=-1).item()
    
    def step(self, state, current_option, reward, next_state, done, logp, entropy):
        self.memory.add(state, current_option, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if (len(self.memory)) > BATCH_SIZE:
            if self.t_step == 0:
                actor_loss_val = actor_loss(state, current_option, logp, entropy,
                                            reward, done, next_state,
                                            self.qnetwork_local, self.qnetwork_target)
                loss = actor_loss_val
                samples = self.memory.sample()
                self.learn(samples, self.gamma, loss) # critic loss [td error]

    def act(self, state, eps, option, eval_mode = True, pa = []):
        state = to_tensor(state).to(device)
        state = self.qnetwork_local.state(state)
        logits = state @ self.qnetwork_local.options_W[option].to(device) + self.qnetwork_local.options_b[option].to(device)
        # With no permitted actions there is nothing to mask or sample from, so fall back to a random action.
        if not pa:
            action = torch.randint(0, NUM_LINES * NUM_LINES, (1,))
            logp = 0
            entropy = 0
            return action, logp, entropy
        if eval_mode:
            for i in range(len(logits)):
                if i not in pa:
                    logits[i] = -float("inf")
        action_dist = (logits / self.temperature).softmax(dim=-1)  # higher temperature pushes the softmax closer to uniform
        action_dist = Categorical(action_dist)  # sample from the resulting categorical (multinomial) distribution
        action = action_dist.sample()
        #action = torch.argmax(logits)
        logp = action_dist.log_prob(action)
        entropy = action_dist.entropy()
        return action.item(), logp, entropy
    
    # For testing, the epsilon/option selection has to be written in the driver script itself.

    def learn(self, samples, gamma, loss):
        states, options, rewards, next_states, dones = samples
        critic_loss_val = critic_loss(self.qnetwork_local, self.qnetwork_target, samples)
        loss += critic_loss_val
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.qnetwork_local.to('cpu')
        self.qnetwork_target.to('cpu')
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU*local_param.data + (1.0-TAU)*target_param.data)
        self.qnetwork_local.to(device)
        self.qnetwork_target.to(device)

    # return buffer_size, batch_size, update_every, gamma, tau
    def get_stats(self):
        return self.buffer_size, self.batch_size, self.update_every, self.gamma, self.tau, self.lr
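
The act() method above draws actions from a temperature-scaled softmax over the selected option's logits. The standalone sketch below (with made-up logits, not values from the example) shows how the temperature controls the trade-off between greedy and uniform behaviour:

import torch
from torch.distributions import Categorical

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])   # illustrative logits for one option

# Dividing the logits by a temperature before the softmax controls exploration:
# temperature -> 0 approaches greedy argmax, a large temperature approaches uniform.
for temperature in (0.1, 1.0, 10.0):
    probs = (logits / temperature).softmax(dim=-1)
    dist = Categorical(probs)
    action = dist.sample()
    print(temperature, [round(p, 3) for p in probs.tolist()],
          action.item(), round(dist.entropy().item(), 3))
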
Esempio n. 12
0
class Agent():
    def __init__(self, q_network, buffer_size, batch_size, update_every, gamma, tau, lr,  seed):

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.qnetwork_local = copy.deepcopy(q_network)
        self.qnetwork_target = copy.deepcopy(q_network)
        self.seed = random.seed(seed)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)

        self.t_step = 0

    def reset_memory(self):
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.seed)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if (len(self.memory)) > self.batch_size:
                samples = self.memory.sample()
                self.learn(samples, self.gamma)

    def act(self, state, eps=0, eval_mode = True, pa = []):
        state = torch.tensor(state).float()
        with torch.no_grad():
            action_values = self.qnetwork_local(state).numpy()

        if eval_mode:
            if random.random() > eps:
                for i in range(len(action_values)):
                    if i not in pa:
                        action_values[i] = - float("inf")
                return np.argmax(action_values)
            else:
                return random.choice(pa)
        else:
            if random.random() > eps:
                return np.argmax(action_values)
            else:
                return random.choice(range(len(action_values)))

    def learn(self, samples, gamma):
        states, actions, rewards, next_states, dones = samples

        q_values_next_states = self.qnetwork_target.forward(next_states).max(dim=1)[0]  # .unsqueeze(1)
        targets = rewards + (gamma * (q_values_next_states) * (1 - dones))
        q_values = self.qnetwork_local.forward(states)

        actions = actions.view(actions.size()[0], 1)
        predictions = torch.gather(q_values, 1, actions).view(actions.size()[0])

        loss = F.mse_loss(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    # return buffer_size, batch_size, update_every, gamma, tau
    def get_stats(self):
        return self.buffer_size, self.batch_size, self.update_every, self.gamma, self.tau, self.lr
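
The one non-obvious line in learn() above is the torch.gather call that picks, per transition, the Q-value of the action that was actually taken. A minimal sketch with made-up numbers:

import torch

q_values = torch.tensor([[0.1, 0.9, 0.3],      # Q(s, .) for a batch of two states
                         [0.4, 0.2, 0.8]])
actions = torch.tensor([1, 2])                  # action index chosen in each state

# gather along dim=1 selects q_values[row, actions[row]] for every row.
predictions = torch.gather(q_values, 1, actions.view(-1, 1)).view(-1)
print(predictions)  # tensor([0.9000, 0.8000])
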
Esempio n. 13
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       fc_units=FC_UNITS).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        fc_units=FC_UNITS).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Collect experience and learn from it.

        Params
        ======
            state (array_like): current state
            action(int): current action
            reward(float): current reward
            next_state(array_like): next state
            done (bool): is episode over?
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if np.random.random() > eps:
            return int(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.random.randint(self.action_size)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
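
A minimal training-loop sketch for the agent above, assuming the module-level constants it relies on (BUFFER_SIZE, BATCH_SIZE, UPDATE_EVERY, GAMMA, TAU, LR, FC_UNITS, device) and the QNetwork / ReplayBuffer classes are already defined; the environment name and the epsilon schedule are illustrative choices, not part of the original code:

import gym

env = gym.make('CartPole-v1')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(500):
    state = env.reset()                                       # old (pre-0.26) Gym API
    done = False
    while not done:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store + learn every UPDATE_EVERY steps
        state = next_state
    eps = max(eps_end, eps_decay * eps)                       # decay exploration
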
Esempio n. 14
0
class Agent_DDPG(object):
    def __init__(
        self,
        action_size,
        state_size,
        action_limit,
    ):
        self.memory_size = 10000
        self.replayBuffer = ReplayBuffer(self.memory_size)
        self.sess = tf.Session()

        self.discount_factor = 0.9
        self.action_variance = 3
        self.critic_learning_rate = 0.001
        self.actor_learning_rate = 0.002
        self.batch_size = 32

        self.action_size, self.state_size, self.action_limit = action_size, state_size, action_limit,
        self.input_state = tf.placeholder(tf.float32, [None, state_size], 's')
        self.input_state_ = tf.placeholder(tf.float32, [None, state_size],
                                           's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.input_state,
                                              scope='eval',
                                              trainable=True)
            a_ = self.build_actor_network(self.input_state_,
                                          scope='tar',
                                          trainable=False)
        with tf.variable_scope('Critic'):
            q_eval = self.build_critic_network(self.input_state,
                                               self.a,
                                               scope='eval',
                                               trainable=True)
            q_target = self.build_critic_network(self.input_state_,
                                                 a_,
                                                 scope='tar',
                                                 trainable=False)

        self.actor_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.actor_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/tar')
        self.critic_evaluation_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.critic_target_params = tf.get_collection(
            key=tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/tar')

        self.replace = [
            tf.assign(t, (1 - 0.01) * t + 0.01 * e) for t, e in zip(
                self.actor_target_params +
                self.critic_target_params, self.actor_evaluation_params +
                self.critic_evaluation_params)
        ]
        # Deterministic policy gradient (see the paper):
        #     dJ/dtheta = E[ dQ/dtheta ]
        # The actor loss simply maximizes the value produced by the critic Q.
        self.a_loss = tf.reduce_mean(q_eval)  # maximize the q
        # Since Q must be maximized, the learning rate is negated ('-') so that minimizing ascends Q.
        self.atrain = tf.train.AdamOptimizer(
            -self.actor_learning_rate).minimize(
                tf.reduce_mean(q_eval), var_list=self.actor_evaluation_params)

        # When self.ctrain is run, the batch actions are fed into self.a.
        # The values are assigned directly to self.a, which is not a placeholder!
        # The critic is updated from (s, a, r, s_): the y computed by the standard formula is the true label,
        # and the network's output is our prediction.
        # True label:  y = r(s, u_t(s)) + gamma * Q(s_, u_t(s_))
        q_true = self.R + self.discount_factor * q_target

        # Prediction: Q = q_eval
        # To compute the MSE loss we need q_eval, so self.input_state must be fed.
        # Likewise, computing q_true requires feeding self.R and self.input_state_ (which feeds q_target).
        self.mseloss = tf.losses.mean_squared_error(labels=q_true,
                                                    predictions=q_eval)
        # This loss updates only the critic, so var_list must be restricted to the critic evaluation network's variables.
        self.ctrain = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(
                self.mseloss, var_list=self.critic_evaluation_params)

        # After building the networks, always initialize the variables.
        self.sess.run(tf.global_variables_initializer())

        self.actor_loss_history = []
        self.critic_loss_history = []

    def store_transition(self, s, a, r, s_):
        self.replayBuffer.add(s, a, r, s_)

    def choose_action(self, s):
        return np.clip(
            np.random.normal(
                self.sess.run(self.a, {self.input_state: s[np.newaxis, :]})[0],
                self.action_variance), -2, 2)

    def learn(self):
        if self.replayBuffer.count() > self.batch_size:
            self.action_variance *= .9995
            self.sess.run(self.replace)

            batch = self.replayBuffer.get_batch(self.batch_size)
            batch_s = np.asarray([x[0] for x in batch])
            batch_a = np.asarray([x[1] for x in batch])
            batch_r = np.asarray([[x[2]] for x in batch])
            batch_s_ = np.asarray([x[3] for x in batch])

            actor_loss, _ = self.sess.run([self.a_loss, self.atrain],
                                          {self.input_state: batch_s})
            critic_loss, _ = self.sess.run(
                [self.mseloss, self.ctrain], {
                    self.input_state: batch_s,
                    self.a: batch_a,
                    self.R: batch_r,
                    self.input_state_: batch_s_
                })

            self.actor_loss_history.append(actor_loss)
            self.critic_loss_history.append(critic_loss)

    def build_actor_network(self, s, scope, trainable):
        actor_hidden_size = 30
        with tf.variable_scope(scope):
            hidden1 = tf.layers.dense(s,
                                      actor_hidden_size,
                                      activation=tf.nn.relu,
                                      name='l1',
                                      trainable=trainable)
            a = tf.layers.dense(hidden1,
                                self.action_size,
                                activation=tf.nn.tanh,
                                name='a',
                                trainable=trainable)
            return tf.multiply(a, self.action_limit, name='scaled_a')

    def build_critic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            critic_hidden_size = 30
            hidden1 = (tf.layers.dense(s, critic_hidden_size, name='s1', trainable=trainable)
                       + tf.layers.dense(a, critic_hidden_size, name='a1', trainable=trainable)
                       + tf.get_variable('b1', [1, critic_hidden_size], trainable=trainable))
            hidden1 = tf.nn.relu(hidden1)
            return tf.layers.dense(hidden1, 1, trainable=trainable)

    def plot_loss(self):
        plt.title('history', fontsize=25)
        ms = 0.1
        me = 1
        line_width = 0.1
        plt.ylabel('Loss')
        plt.xlabel('Training steps')

        # Normalize each curve by its mean so both losses fit on the same axis.
        actor_loss_mean = sum(self.actor_loss_history) / len(self.actor_loss_history)
        self.actor_loss_history = np.array(self.actor_loss_history) / actor_loss_mean
        critic_loss_mean = sum(self.critic_loss_history) / len(self.critic_loss_history)
        self.critic_loss_history = np.array(self.critic_loss_history) / critic_loss_mean

        plt.plot(np.arange(len(self.actor_loss_history)),
                 self.actor_loss_history,
                 '-p',
                 color='b',
                 markevery=me,
                 label=r'actor loss',
                 lw=line_width,
                 markersize=ms)
        plt.plot(np.arange(len(self.critic_loss_history)),
                 self.critic_loss_history,
                 '--^',
                 color='r',
                 markevery=me,
                 label=r'critic loss',
                 lw=line_width,
                 markersize=ms)

        plt.grid()
        ax = plt.subplot(111)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.ylim(0, 10)
        plt.show()

    def plot_reward(self, reward_history):
        plt.plot(np.arange(len(reward_history)), reward_history)
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
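
A usage sketch for Agent_DDPG above, assuming the old Gym API of the same era as this TF1 code; the Pendulum environment (whose action range matches the [-2, 2] clip in choose_action) and the episode counts are illustrative choices:

import gym

env = gym.make('Pendulum-v0')
agent = Agent_DDPG(action_size=env.action_space.shape[0],
                   state_size=env.observation_space.shape[0],
                   action_limit=float(env.action_space.high[0]))

reward_history = []
for episode in range(100):
    s = env.reset()
    episode_reward = 0.0
    for _ in range(200):
        a = agent.choose_action(s)            # noisy action, clipped to [-2, 2]
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r, s_)
        agent.learn()                         # trains once the buffer holds a full batch
        s = s_
        episode_reward += r
        if done:
            break
    reward_history.append(episode_reward)

agent.plot_loss()
agent.plot_reward(reward_history)
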
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.device = Utils.getDevice()

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.steps = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.steps += 1
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
            self.steps = 0

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise.
        """
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        self.update_critic(states, actions, rewards, next_states, dones, gamma)

        # ---------------------------- update actor ---------------------------- #
        self.update_actor(states)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_critic(self, states, actions, rewards, next_states, dones,
                      gamma):
        """ update critic 
        Params
        ======
            states: batch of current states
            actions: batch of actions taken
            rewards: batch of rewards received
            next_states: batch of next states
            dones: batch of episode-finished flags
            gamma: discount factor
        """
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def update_actor(self, states):
        """ update actor  
        Params
        ======
            states:  current state
        """

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
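
The OUNoise class used for exploration above is referenced but not shown in this listing. A common Ornstein-Uhlenbeck implementation with the same (size, seed) constructor and reset()/sample() interface looks roughly like this; it is a sketch of the usual pattern, not necessarily the exact class used above:

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Apply one mean-reverting update with Gaussian noise and return the state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state
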