Example #1
    def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            alpha (float): learning rate for the optimizer
            gamma (float): discount factor
            tau (float): interpolation factor for soft target updates
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Q Learning Network
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.alpha)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
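
All of these examples assume a ReplayBuffer class defined elsewhere in each project, and its constructor signature varies from example to example. For orientation only, a minimal fixed-size, uniformly sampled buffer matching the (action_size, BUFFER_SIZE, BATCH_SIZE, seed) signature used in Example #1 might look like the sketch below; the field names and device handling are assumptions, not the actual class from any of these projects.

# Hypothetical minimal replay buffer sketch; the projects above define their own versions.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped first
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)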
Example #2
    def __init__(self, state_size, action_size):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       fc_units=FC_UNITS).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        fc_units=FC_UNITS).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #3
    def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
        self.x = x
        self.y = y
        self.size = size
        self.colour = (0, 0, 255)
        self.thickness = 0
        self.speed = 0
        self.angle = 0
        self.mass = mass
        self.drag = (self.mass /
                     (self.mass + Constants.MASS_OF_AIR))**self.size
        ####################################
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size).to(Constants.DEVICE)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size).to(Constants.DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=Constants.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE,
                                   Constants.BATCH_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #4
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action

        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(
            self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)
Example #5
    def __init__(self, state_size, action_size, batch_size, buffer_size, gamma,
                 lr):
        """ Initialize an Agent.

        @param state_size: (int) dimension of each state (= n)
        @param action_size: (int) dimension of the action space (= n); the greedy action is the argmax over Q-values
        @param batch_size: (int) mini-batch size
        @param buffer_size: (int) replay-buffer size
        @param gamma: (float) discount factor
        @param lr: (float) learning rate
        """
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.state_goal_size = 2 * state_size  # state+goal = 2n
        self.action_size = action_size

        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.lr = lr

        # Q-Network
        self.qnetwork_local = QNetwork(self.state_goal_size,
                                       action_size).to(self.device)
        self.qnetwork_target = QNetwork(self.state_goal_size,
                                        action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size)
Example #6
 def __init__(self, env, config):
     self.C = config
     self.n_state = list(env.observation_space.shape)
     self.n_action = env.action_space.n
     self.epsilon = 1.
     self.lr = 1e-3
     self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
     self.net = Net(self.n_state, self.n_action, self.C)
Example #7
class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    #Random action during Practice
    def act_pre(self):
        a = np.random.randint(self.n_action)
        return a

    # Epsilon-greedy action selection
    def act(self, s):
        if np.random.random() > self.epsilon:
            a = self.greedy_act(s)
        else:
            a = np.random.randint(self.n_action)
        return a

    def greedy_act(self, s):
        return self.net.action(s)

    # Practice without recording experiences
    def practice(self):
        self.lr = 1e-3  #possible
        self.net.pre_train(self.buffer, self.lr)

    # Records experiences and calls training functions
    def record(self, s, a, r, d, it, pre):

        #Variable pre is used to differentiate practice from RL training.
        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3
                    self.net.pre_train(self.buffer, self.lr)

        else:
            self.buffer.append(s, a, r, d)
            if it <= 5e5:
                self.epsilon = linear_interp(0, 5e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(5e5, 10e6, it, 0.01, 0.1),
                                   0.01)

            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4  #Learning rate for RL training
                    self.net.train(self.buffer, self.lr)

                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
Example #8
 def __init__(self, env, config):
     self.C = config
     self.n_state = list(env.observation_space.shape)
     self.n_action = env.action_space.n
     self.epsilon = 0.99
     self.lr = 1e-3  #Learning rate
     self.buffer = ReplayBuffer(
         self.C['max_size'], self.C['frame_stack'])  #Memory for RL-Training
     self.buffer2 = ReplayBuffer(
         self.C['max_size'], self.C['frame_stack'])  #Memory for Practice
     self.net = Net(self.n_state, self.n_action, self.C)
Example #9
	def __init__(self, input_size, output_size, training_mode, seed):
		self.seed = random.seed(seed)
		self.epsilon = self.EPSILON_MAX
		self.training_mode = training_mode
		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
		self.memory = ReplayBuffer(self.device, seed=seed)
		self.nn = NN(input_size, output_size, seed).to(self.device)
		if self.DOUBLE_DQN:
			self.target_nn = NN(input_size, output_size, seed).to(self.device)
		self.optimizer = optim.Adam(self.nn.parameters(), lr=self.ALPHA, amsgrad=False)
		self.loss_func = nn.MSELoss()
Example #10
class Agent:
    def __init__(self, env, config, wt):
        self.C = config
        self.n_state = list(env.observation_space.shape)
        self.n_action = env.action_space.n
        self.epsilon = 0.99
        self.lr = 1e-3
        self.wt = wt
        self.buffer = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.buffer2 = ReplayBuffer(self.C['max_size'], self.C['frame_stack'])
        self.net = Net(self.n_state, self.n_action, self.C, self.wt)

    def act_pre(self):
        a = np.random.randint(self.n_action)
        return a

    def act(self, s):
        if np.random.random() > self.epsilon:
            a = self.greedy_act(s)
        else:
            a = np.random.randint(self.n_action)
        return a

    def greedy_act(self, s):
        return self.net.action(s)

    def record(self, s, a, r, d, it, pre):

        if pre:
            self.buffer.append(s, a, r, d)
            if it > self.C['pre_training_start']:
                if it % self.C['pre_train_freq'] == 0:
                    self.lr = 1e-3  #possible
                    self.net.pre_train(self.buffer, self.lr)

        else:
            self.buffer.append(s, a, r, d)
            if it <= 6e5:
                self.epsilon = linear_interp(0, 6e5, it, 0.1, 1.0)
            else:
                self.epsilon = max(linear_interp(6e5, 10e6, it, 0.01, 0.1),
                                   0.01)

            if it > self.C['training_start']:
                if it % self.C['train_freq'] == 0:
                    self.lr = 1e-4
                    self.net.train(self.buffer, self.lr)
                    # print(Q)

                if it % self.C['update_target_freq'] == 0:
                    self.net.update_target_network()
Example #11
    def __init__(self,
                 alpha=0.2,
                 input_dims=None,
                 env=None,
                 gamma=0.99,
                 n_actions=None,
                 max_size=10000000,
                 batch_size=32,
                 polyak=0.995,
                 lr=1e-3):
        self.gamma = gamma
        self.alpha = alpha  # Definition of the temperature parameter
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.polyak = polyak
        self.lossPi = []
        self.lossQ = []
        self.lossV = []
        self.lr = lr
        """ Definition of the neural networks: 1 actor, 2 critics, 1 value and 1 target value"""
        # Two ways to estimate the value function:
        # 1) define and learn a dedicated value network (the approach used here)
        # 2) estimate V(s_t) as the expected value of Q(s_t, a_t) minus the entropy
        #    term log pi(a_t|s_t), with a_t sampled from the current policy pi

        self.actor = ActorNetwork(input_dims,
                                  n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(input_dims,
                                      n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(input_dims,
                                      n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(input_dims, name='value_net')

        self.target_value = ValueNetwork(input_dims, name="target_net")

        # Initialize the main V-network and the target V-network with the same parameters.
        for target_parameter, parameter in zip(self.target_value.parameters(),
                                               self.value.parameters()):
            target_parameter.data.copy_(parameter.data)

        # For simplicity, gather the parameters of both Q-networks into a single iterable
        CriticParameters = itertools.chain(self.critic_1.parameters(),
                                           self.critic_2.parameters())
        # define the Adam optimizer for those parameters
        self.optimizerCritic = optim.Adam(CriticParameters,
                                          lr=self.critic_1.lr)
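
The comments in Example #11 mention an alternative to learning a separate value network: estimating V(s_t) as the expectation of Q(s_t, a_t) minus the entropy term alpha * log pi(a_t|s_t) under the current policy. A minimal PyTorch-style sketch of that target follows; the actor.sample method (returning actions and their log-probabilities) and the alpha temperature argument are assumptions, not part of the example above.

# Hypothetical helper; `actor.sample` is an assumed API returning (actions, log_probs).
import torch


def soft_value_target(states, actor, critic_1, critic_2, alpha):
    """Single-sample estimate of V(s) = E_{a~pi}[min(Q1(s, a), Q2(s, a)) - alpha * log pi(a|s)]."""
    with torch.no_grad():
        actions, log_probs = actor.sample(states)  # a ~ pi(.|s) and log pi(a|s)
        q1 = critic_1(states, actions)
        q2 = critic_2(states, actions)
        q_min = torch.min(q1, q2)  # clipped double-Q estimate
        return q_min - alpha * log_probs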
Example #12
    def __init__(self, q_network, buffer_size, batch_size, update_every, gamma, tau, lr, seed):

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.qnetwork_local = copy.deepcopy(q_network)
        self.qnetwork_target = copy.deepcopy(q_network)
        self.seed = random.seed(seed)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)

        self.t_step = 0
Example #13
	def __init__(self):

		# self.moveNet = moveCNN()
		self.state = np.zeros(14)
		self.lstate = np.zeros(14)
		self.stateT = torch.FloatTensor()
		self.sub = rospy.Subscriber('/state', Floats, self.fetch)
		self.pub = rospy.Publisher('/cmd_vel_mux/input/navi', Twist, queue_size=10)
		self.fpass = 1
		self.move_cmd = Twist()
		self.move_cmd.linear.x = 0
		self.move_cmd.angular.z = 0
		self.max_episodes = 20000
		self.episode_length = 20  # maybe change later
		self.num_episodes = 0
		self.terminal = 0
		self.rBuf = ReplayBuffer()
Example #14
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        # tuning gamma between (0.95 - 0.99)
        self.gamma = 0.99  # discount factor
        # tuning tau around (0.001 - 0.01)
        self.tau = 0.005  # for soft update of target parameters

        self.best_score = -np.inf
        self.score = 0
        self.step_count = 0
Example #15
    def __init__(self,
                 board_size,
                 gamma=0.9,
                 buffer_size=3000,
                 use_target_net=False):
        assert 0 <= gamma <= 1, "gamma should be in 0 to 1, got {}".format(gamma)

        self._board_size = board_size
        self._gamma = gamma

        self._buffer = ReplayBuffer(buffer_size)
        self._buffer_size = buffer_size

        self._input_shape = (self._board_size, self._board_size, 1)
        self._model = self.agent_model()
        self._use_target_net = use_target_net
        if (use_target_net):
            self._target_net = self.agent_model()
            self.update_target_net()
Example #16
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)
Example #17
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.device = Utils.getDevice()

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.steps = 0
Example #18
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()
Example #19
    def __init__(self,
                 lr,
                 inputChannels,
                 stateShape,
                 numActions,
                 batchSize,
                 epsilon=1.0,
                 gamma=0.99,
                 layer1Size=1024,
                 layer2Size=512,
                 maxMemSize=100000,
                 epsMin=0.01,
                 epsDecay=5e-4):
        self.lr = lr
        self.epsilon = epsilon
        self.epsMin = epsMin
        self.epsDecay = epsDecay
        self.gamma = gamma
        self.batchSize = batchSize
        self.actionSpace = list(range(numActions))
        self.maxMemSize = maxMemSize

        self.memory = ReplayBuffer(maxMemSize, stateShape)
        self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)
Example #20
class DQAgent():
    def __init__(self,
                 lr,
                 inputChannels,
                 stateShape,
                 numActions,
                 batchSize,
                 epsilon=1.0,
                 gamma=0.99,
                 layer1Size=1024,
                 layer2Size=512,
                 maxMemSize=100000,
                 epsMin=0.01,
                 epsDecay=5e-4):
        self.lr = lr
        self.epsilon = epsilon
        self.epsMin = epsMin
        self.epsDecay = epsDecay
        self.gamma = gamma
        self.batchSize = batchSize
        self.actionSpace = list(range(numActions))
        self.maxMemSize = maxMemSize

        self.memory = ReplayBuffer(maxMemSize, stateShape)
        self.deepQNetwork = DQNetwork(lr, inputChannels, numActions)

    '''
    REENABLE EPSILON GREEDY
    '''

    def chooseAction(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor(observation).float().clone().detach()
            state = state.to(self.deepQNetwork.device)
            state = state.unsqueeze(0)
            policy = self.deepQNetwork(state)
            action = torch.argmax(policy).item()
            return action
        else:
            return np.random.choice(self.actionSpace)

    def storeMemory(self, state, action, reward, nextState, done):
        self.memory.storeMemory(state, action, reward, nextState, done)

    def learn(self):
        if self.memory.memCount < self.batchSize:
            return

        self.deepQNetwork.optimizer.zero_grad()

        stateBatch, actionBatch, rewardBatch, nextStateBatch, doneBatch = \
            self.memory.sample(self.batchSize)
        stateBatch = torch.tensor(stateBatch).to(self.deepQNetwork.device)
        actionBatch = torch.tensor(actionBatch).to(self.deepQNetwork.device)
        rewardBatch = torch.tensor(rewardBatch).to(self.deepQNetwork.device)
        nextStateBatch = torch.tensor(nextStateBatch).to(
            self.deepQNetwork.device)
        doneBatch = torch.tensor(doneBatch).to(self.deepQNetwork.device)

        batchIndex = np.arange(self.batchSize, dtype=np.int64)

        actionQs = self.deepQNetwork(stateBatch)[batchIndex, actionBatch]
        allNextActionQs = self.deepQNetwork(nextStateBatch)
        nextActionQs = torch.max(allNextActionQs, dim=1)[0]
        nextActionQs[doneBatch] = 0.0
        qTarget = rewardBatch + self.gamma * nextActionQs

        loss = self.deepQNetwork.loss(qTarget,
                                      actionQs).to(self.deepQNetwork.device)
        loss.backward()
        self.deepQNetwork.optimizer.step()

        if self.epsilon > self.epsMin:
            self.epsilon -= self.epsDecay
Example #21
class Enemy():
    def __init__(self, x, y, size, state_size, action_size, seed, mass=1):
        self.x = x
        self.y = y
        self.size = size
        self.colour = (0, 0, 255)
        self.thickness = 0
        self.speed = 0
        self.angle = 0
        self.mass = mass
        self.drag = (self.mass /
                     (self.mass + Constants.MASS_OF_AIR))**self.size
        ####################################
        self.state_size = state_size
        self.action_size = action_size

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size).to(Constants.DEVICE)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size).to(Constants.DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=Constants.LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, Constants.BUFFER_SIZE,
                                   Constants.BATCH_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        #######################################

    def display(self, screen):
        pygame.draw.circle(screen, self.colour, (int(self.x), int(self.y)),
                           self.size, self.thickness)

    def move(self):
        self.x += math.sin(self.angle) * self.speed
        self.y -= math.cos(self.angle) * self.speed
        self.speed *= self.drag

    def bounce(self, soccerfield):
        if self.x > Constants.SIZE_WIDTH - self.size:
            self.x = 2 * (Constants.SIZE_WIDTH - self.size) - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY
        elif self.x < self.size:
            self.x = 2 * self.size - self.x
            self.angle = -self.angle
            self.speed *= Constants.ELASTICITY

        if self.y > Constants.SIZE_HEIGHT - self.size:
            self.y = 2 * (Constants.SIZE_HEIGHT - self.size) - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY
        elif self.y < self.size:
            self.y = 2 * self.size - self.y
            self.angle = math.pi - self.angle
            self.speed *= Constants.ELASTICITY

        if self.x > int((19 * Constants.SIZE_WIDTH) / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 -
                              self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
        elif self.x < int(Constants.SIZE_WIDTH / 20):
            if int(self.y + self.size) == int(Constants.SIZE_HEIGHT / 3):
                self.y = 2 * (Constants.SIZE_HEIGHT / 3 -
                              self.size) - self.y - 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY
            elif int(self.y + self.size) == int(2 * Constants.SIZE_HEIGHT / 3):
                self.y = 2 * self.size - self.y + 1
                self.angle = math.pi - self.angle
                self.speed *= Constants.ELASTICITY

        for i in range(4):
            dx = self.x - soccerfield.goalposts[i].x
            dy = self.y - soccerfield.goalposts[i].y
            dist = math.hypot(dx, dy)
            if dist < self.size + soccerfield.goalposts[i].size:
                angle = math.atan2(dy, dx) + 0.5 * math.pi
                total_mass = self.mass + 9999
                (self.angle, self.speed) = self.addVectors(
                    self.angle,
                    self.speed * (self.mass - 9999) / total_mass, angle, 0)
                self.speed *= Constants.ELASTICITY
                overlap = 0.5 * (self.size + soccerfield.goalposts[i].size -
                                 dist + 1)
                self.x += math.sin(angle) * overlap
                self.y -= math.cos(angle) * overlap
                break

    '''
        0 -> shoot
        1 -> up + left
        2 -> up + right
        3 -> down + left
        4 -> down + right
        5 -> up 
        6 -> down
        7 -> left
        8 -> right
     '''

    def update(self, action, ball):

        if action == 0 and self.control_ball(ball):
            dx = -(self.x - ball.x) / 6
            dy = -(self.y - ball.y) / 6
            ball.angle = 0.5 * math.pi + math.atan2(dy, dx)
            ball.speed = math.hypot(dx, dy)
        if action == 1:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 2:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = -Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 3:
            dx = -Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 4:
            dx = Constants.UPDATE_DOUBLE_DXY
            dy = Constants.UPDATE_DOUBLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 5:
            dx = 0
            dy = -Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 6:
            dx = 0
            dy = Constants.UPDATE_SINGLE_DXY
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 7:
            dx = -Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)
        if action == 8:
            dx = Constants.UPDATE_SINGLE_DXY
            dy = 0
            self.angle = 0.5 * math.pi + math.atan2(dy, dx)
            self.speed = math.hypot(dx, dy)

    def control_ball(self, ball):
        dx = self.x - ball.x
        dy = self.y - ball.y
        dist = math.hypot(dx, dy)
        if dist - 3 < self.size + ball.size:
            return True
        return False

    def addVectors(self, angle1, length1, angle2, length2):
        x = math.sin(angle1) * length1 + math.sin(angle2) * length2
        y = math.cos(angle1) * length1 + math.cos(angle2) * length2

        angle = 0.5 * math.pi - math.atan2(y, x)
        length = math.hypot(x, y)

        return (angle, length)

    ###################
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % Constants.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > Constants.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, Constants.GAMMA)

    def act(self, state, eps=0.):
        # Returns actions for given state as per current policy.

        state = torch.from_numpy(state).float().unsqueeze(0).to(
            Constants.DEVICE)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #

        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         Constants.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #22
    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise
        kwargs["noise_clip"] = args.noise_clip
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./checkpoint/{policy_file}")

    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = []
    # evaluations = [eval_policy(policy, env, args.seed, group_name)]

    # state, done = env.reset(group_name), False
    episode_reward = 0
    episode_Rsim = 0
    episode_Robs = 0
    episode_Rcstr = 0
    episode_timesteps = 0
    episode_num = 0

    for group_name in [group_name]:
        state, done = env.reset(group_name), False
Example #23
ENV_NAME = 'BreakoutDeterministic-v4'

# Create environment
game_wrapper = GameWrapper(ENV_NAME, MAX_NOOP_STEPS)
print("The environment has the following {} actions: {}".format(
    game_wrapper.env.action_space.n,
    game_wrapper.env.unwrapped.get_action_meanings()))

# Create agent
MAIN_DQN = buildq_network(game_wrapper.env.action_space.n,
                          LEARNING_RATE,
                          input_shape=INPUT_SHAPE)
TARGET_DQN = buildq_network(game_wrapper.env.action_space.n,
                            input_shape=INPUT_SHAPE)

replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE)
agent = Agent(MAIN_DQN,
              TARGET_DQN,
              replay_buffer,
              game_wrapper.env.action_space.n,
              input_shape=INPUT_SHAPE)

print('Loading model...')
agent.load('save-01603987')
print('Loaded')

terminal = True
eval_rewards = []
evaluate_frame_number = 0

for frame in range(EVAL_LENGTH):
Example #24
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd,
                             self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic,
                                   l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic,
                                    l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.rewardSum = 0
Example #25
class AgentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd,
                             self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0001  # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic,
                                   l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic,
                                    l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        return list(action + noise), noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #26
class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action

        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(
            self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if (train and self.replay_buffer.size() >= self.MINIBATCH_SIZE):
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch

        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])

        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime

        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = (predicted_qs - ys) * (predicted_qs - ys)
            loss = tf.reduce_mean(loss)
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)

        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch

        with tf.GradientTape() as tape:
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(
                self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_network.trainable_weights)

        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch

        s = np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.state_size)
        s = tf.convert_to_tensor(s)
        a = np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                  self.action_size)
        a = tf.convert_to_tensor(a)
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE,
                                                      self.state_size)
        s_1 = tf.convert_to_tensor(s_1)
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)

        minibatch = (s, a, r, s_1, t)

        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        self.actor_target_network.set_weights(
            np.array(self.actor_network.get_weights()) * self.TAU +
            np.array(self.actor_target_network.get_weights()) * (1 - self.TAU))
        self.critic_target_network.set_weights(
            np.array(self.critic_network.get_weights()) * self.TAU +
            np.array(self.critic_target_network.get_weights()) *
            (1 - self.TAU))
Example #27
    }
    episodes = 301
    lr = .001
    gamma = .93
    alpha = .001
    epsilon = 1
    tau = 2500
    wait = 3000
    batch_size = 32
    maxLengthGame = 450
    Qprincipal = QNetwork(lr)
    Qtarget = QNetwork(lr)

    Qtarget.model.set_weights(Qprincipal.model.get_weights())

    rBuffer = ReplayBuffer(5000)
    count = 0
    results = []

    print(' Episode   |   Score   |    Loss  |  Rounds')
    for ep in range(episodes):
        loss = 0
        bots = [Robot() for i in range(3)]
        game = Game(player_names=[bot.name for bot in bots])
        epsilon = max(epsilon * .995, .1)
        winnings = 20000
        for i in range(maxLengthGame):
            maxLengthGame -= 1
            winnings -= 20
            count += 1
            if game.dieRoll == 7:
Example #28
				file_name = 'games/' + result + '_' + str(np.random.randint(500000))

			with open(file_name, 'wb') as f:
				pkl.dump(game, f)
			
			replay_buffer.save_game(game)
			print('Thread: {}, Game: {}, Result {}, Reward {}'.format(threading.get_ident(), i, result, terminal_value))
			
			


if __name__ == '__main__':

	remote = False
	config = Config()
	replay_buffer = ReplayBuffer(config)
	network = Network(config, remote=remote)

	num_epochs = 1000000

	for e in range(num_epochs):
		
		# Make network read-only so it can be run on multiple threads
		if not remote:
			network.graph.finalize()

		jobs = []

		for _ in range(config.num_actors):
			job = SelfPlay()
			job.start()
Example #29
        self.lrScheduler.step()
        return loss.item()

    def evalMotionModel(self, dataBatch):
        self.MotionModel.eval()
        actualNextStates = dataBatch[1][0]
        predictedNextStates = self.MotionModel(dataBatch[0])
        loss = self.criterion(actualNextStates, predictedNextStates)
        return loss.item()

if __name__ == '__main__':
    # check if cuda available
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    writer = SummaryWriter()

    # load replay buffer
    cpuReplayBuffer = ReplayBuffer(loadDataPrefix='simData/', saveDataPrefix='simData/', chooseCPU=True)
    cpuReplayBuffer.loadData(matchLoadSize=True)
    outputDataSTD = standardizeData()
    outputDataSTD.getDistribution(cpuReplayBuffer.outputData[0])
    cpuReplayBuffer.outputData[0] = outputDataSTD.whiten(cpuReplayBuffer.outputData[0])

    data = cpuReplayBuffer.getRandBatch()
    inStateDim = data[0][0].shape[1]
    inMapDim = data[0][1].shape[2]
    inActionDim = data[0][2].shape[1]
    outStateDim = data[1][0].shape[1]

    # training/ neural network parameters
    learningRate = 0.01
    lrDecay_stepSize = 3000
    lrDecay_gamma = 0.9
Example #30
if __name__ == "__main__":
    replayBufferLength = 500000
    numParallelSims = 16
    sims = []
    # set up simulations
    for i in range(numParallelSims):
        if i == -1:  # change -1 to a simulation index to attach the GUI client for debugging
            physicsClientId = p.connect(p.GUI)
        else:
            physicsClientId = p.connect(p.DIRECT)
        sims.append(simController(physicsClientId=physicsClientId))

    data = sims[0].controlLoopStep([0, 0])
    replayBuffer = ReplayBuffer(replayBufferLength,
                                data[0],
                                data[1],
                                saveDataPrefix='simData/',
                                chooseCPU=True)

    sTime = time.time()
    executor = concurrent.futures.ProcessPoolExecutor()
    while not replayBuffer.bufferFilled:
        results = executor.map(runSim, sims)
        for result in results:
            for data in result:
                replayBuffer.addData(data[0], data[1])
        print("replay buffer index: " + str(replayBuffer.bufferIndex) +
              ", rtf: " + str(replayBuffer.bufferIndex * 0.25 /
                              (time.time() - sTime)))
        print("estimated time left: " +
              str((replayBufferLength - replayBuffer.bufferIndex) /