Beispiel #1
0
    def __init__(self, env, DIRECTORY):
        self.batch_size = BATCH_SIZE
        self.replay_start_size = REPLAY_START_SIZE  # self.sub_batch_size = BATCH_SIZE / n_gpu

        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.trace_length = TRACE_LENGTH
        self.temp_abstract = TEMP_ABSTRACT
        self.actor_network = ActorNetwork(self.sess, BATCH_SIZE,
                                          self.state_dim, self.action_dim,
                                          self.temp_abstract, DIRECTORY)
        self.critic_network = CriticNetwork(self.sess, BATCH_SIZE,
                                            self.state_dim, self.action_dim,
                                            self.temp_abstract, DIRECTORY)

        # initialize replay buffer
        max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                          max_len_trajectory,
                                          self.actor_network.last_epi)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        ###
        self.diff = 0.
        self.discounting_mat_dict = {}
    def __init__(self, sess, number, model_path, global_episodes, explore,
                 decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph(
            'global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph(
            'global/critic', self.name + '/critic')
Beispiel #3
0
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]

        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" %
                                  (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")
Beispiel #4
0
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        # self.action_dim = env.action_space.shape[0]
        self.state_dim = env.state_size
        self.action_dim = env.action_size
        self.action_bound = (env.action_high - env.action_low) / 2

        print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
              'action_bound: ', self.action_bound)

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.action_bound)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.action_bound)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #5
0
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.OU = OU()

        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
Beispiel #6
0
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.local_actor_network = ActorNetwork(state_size, action_size)
        self.target_actor_network = ActorNetwork(state_size, action_size)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size, action_size)
        self.target_critic_network = CriticNetwork(state_size, action_size)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
    def __init__(self,
                 input_dim,
                 action_dim,
                 critic_layers,
                 actor_layers,
                 actor_activation,
                 scope='ac_network'):

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.scope = scope

        self.x = tf.placeholder(shape=(None, input_dim),
                                dtype=tf.float32,
                                name='x')
        self.y = tf.placeholder(shape=(None, ), dtype=tf.float32, name='y')

        with tf.variable_scope(scope):
            self.actor_network = ActorNetwork(self.x,
                                              action_dim,
                                              hidden_layers=actor_layers,
                                              activation=actor_activation)

            self.critic_network = CriticNetwork(
                self.x,
                self.actor_network.get_output_layer(),
                hidden_layers=critic_layers)

            self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
            self._build()
Beispiel #8
0
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma,
                 tau, batch_size, n_train, n_episode):
        # Gym environment
        self.env = env

        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        #self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get replay buffer and function get a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #9
0
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (
                self.epsilon_expert_range[0] -
                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert,
                                      self.epsilon_expert_range[1])
            self.epsilon_random -= (
                self.epsilon_random_range[0] -
                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random,
                                      self.epsilon_random_range[1])
            logger.warn(
                "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
                % (path, self.time_step, self.epsilon_expert,
                   self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")

        self.critic_cost = 0
Beispiel #10
0
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
Beispiel #11
0
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.actor = ActorNetwork(state_size, action_size)
        self.actor_target = ActorNetwork(state_size, action_size)
        self.soft_update(self.actor_target.parameters(),
                         self.actor.parameters(), 1)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=ACTOR_LEARNING_RATE)

        # Create one critic per agent
        self.critics = []
        self.critic_targets = []
        self.critic_optimizers = []
        for i in range(num_agents):

            # Critic
            # Note: we use action_size * num_agents since we'll pass in the actions of all agents concatenated
            critic = CriticNetwork(state_size * num_agents,
                                   action_size * num_agents)
            self.critics.append(critic)
            self.critic_targets.append(
                CriticNetwork(state_size * num_agents,
                              action_size * num_agents))
            self.soft_update(self.critic_targets[-1].parameters(),
                             critic.parameters(), 1)
            self.critic_optimizers.append(
                optim.Adam(critic.parameters(),
                           lr=CRITIC_LEARNING_RATE,
                           weight_decay=CRITIC_WEIGHT_DECAY))

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess((1, action_size),
                                                       sigma=RANDOM_SIGMA,
                                                       theta=RANDOM_THETA)
Beispiel #12
0
    def __init__(self, sess, data_fname):
        self.name = 'DDPG'
        # Randomly initialize actor network and critic network
        # with both their target networks

        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = Hp.state_dim
        self.action_dim = Hp.action_dim
        print(self.state_dim, self.action_dim)

        self.sess = sess

        self.state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.target_state_input = [
            tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
            for _ in xrange(Hp.categories)
        ]
        #tf.placeholder("float",[None,self.state_dim])
        self.state_network = StateEnc(self.sess, self.state_input,
                                      self.target_state_input)
        state_batch = self.state_network.encoding
        next_state_batch = self.state_network.target_encoding

        weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters(
        )

        state_network_params = weights + biases + [
            w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
        ]

        self.actor_network = ActorNetwork(self.sess, Hp.n_hidden,
                                          self.action_dim, self.state_input,
                                          state_batch, next_state_batch,
                                          state_network_params)
        self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                            self.action_dim, state_batch,
                                            next_state_batch)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
        self.summary_str2 = None

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #13
0
    def __init__(self, track_name='practgt2.xml'):
        BUFFER_SIZE = 100000
        TAU = 0.001  # Target Network HyperParameters
        LRA = 0.0001  # Learning rate for Actor
        LRC = 0.001  # Lerning rate for Critic
        state_dim = 29  # of sensors input
        self.batch_size = 32
        self.lambda_mix = 10.0
        self.action_dim = 3  # Steering/Acceleration/Brake

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)

        self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                                  self.batch_size, TAU, LRA)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                    self.batch_size, TAU, LRC)
        self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
        self.track_name = track_name

        self.save = dict(total_reward=[],
                         total_step=[],
                         ave_reward=[],
                         distRaced=[],
                         distFromStart=[],
                         lastLapTime=[],
                         curLapTime=[],
                         lapTimes=[],
                         avelapTime=[],
                         ave_sp=[],
                         max_sp=[],
                         min_sp=[],
                         test_total_reward=[],
                         test_total_step=[],
                         test_ave_reward=[],
                         test_distRaced=[],
                         test_distFromStart=[],
                         test_lastLapTime=[],
                         test_curLapTime=[],
                         test_lapTimes=[],
                         test_avelapTime=[],
                         test_ave_sp=[],
                         test_max_sp=[],
                         test_min_sp=[])
    def __init__(self, env,device):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        self.device=device
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        
        self.actor_network = ActorNetwork(self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #15
0
	def __init__(self, env):
		self.sess = tf.InteractiveSession()
		#self.params = loadparams() # ???
		self.env = env
		self.n_states = env.observation_space.shape[0]
		self.n_actions = env.action_space.shape[0]
		self.low = self.env.action_space.low
		self.high = self.env.action_space.high
		self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
		self.trainable_var_count = self.actor_network.get_trainable_var_count()
		self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \
			self.actor_network, self.trainable_var_count)
		self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']???
		self.exploration_noise = OUNoise(self.n_actions)
		# self.noise = Noise()
		self.gamma = GAMMA
		self.sess.run(tf.global_variables_initializer())
Beispiel #16
0
 def __init__(self, num_agents, state_dim, action_dim):
     # track training times
     self.time_step = 0
     # use set session use GPU
     #self.sess = tf.InteractiveSession()
     self.sess = tf.Session(config=tf.ConfigProto(
         log_device_placement=True))
     self.num_agents = num_agents
     self.state_dim = state_dim
     self.action_dim = action_dim
     self.agents = self.create_multi_agents(self.sess, num_agents,
                                            self.state_dim, self.action_dim)
     # make sure create Criticnetwork later, summarise mean Q value inside
     self.critic = CriticNetwork(self.sess, state_dim, action_dim)
     self.exploration_noise = OUNoise((self.num_agents, action_dim))
     self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
     # for store checkpoint
     self.saver = tf.train.Saver()
Beispiel #17
0
    def __init__(self, state_dim, action_dim, env):
        self.name = 'DDPG'  # name for uploading results
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.environment = env
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #18
0
    def __init__(self, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #19
0
    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
Beispiel #20
0
    def __init__(self, sess, env, par_idx):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.par_idx = par_idx
        self.sess = sess

        with tf.variable_scope("particle_" + str(par_idx)):
            self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                              self.action_dim, self.par_idx)
            self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                                self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #21
0
    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #22
0
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return
Beispiel #23
0
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir,
                 img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir

        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim, self.img_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        self.saver = tf.train.Saver()
Beispiel #24
0
    def __init__(self, env,loadfilename=None,printVars=False):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        
        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)

        #print 'init complete'
        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print v.name.ljust(30), v.shape
        
        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
Beispiel #25
0
    def __init__(self, state_dim, action_dim):
        """name for uploading resuults"""
        self.name = 'DDPG'
        self.time_step = 0
        # self.atten_rate = 1
        """Randomly initialize actor network and critic network"""
        """and both their target networks"""
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)
        """initialize replay buffer"""
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        """Initialize a random process the Ornstein-Uhlenbeck process for action exploration"""
        self.exploration_noise = OUNoise(self.action_dim)
        """Initialize a Treading"""
        self.threading = threading.Thread(target=self.train,
                                          name='LoopThread--DDPG')
Beispiel #26
0
    def __init__(self, env):
        # ------------------- init the (NN) & (Buf) & (explor noise) & (counter) -------------------
        self.name = 'DDPG'  # name for uploading results
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env[0]
        self.action_dim = env[1]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.epsilon_max = 1.0
        self.epsilon_min = 0.01
        self.counter = 0
Beispiel #27
0
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0
Beispiel #28
0
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
Beispiel #29
0
    def initialize(self):

        self.net = {}
        self.opt = {}

        if self.pars["twin"]:
            self.twins = ["_1", "_2"]
        else:
            self.twins = [""]

        for netname in ["act_loc", "act_tg"]:
            self.net[netname] = LinearNetwork(
                input_shape=self.pars["state_size"],
                lin_layers=self.pars["actor_layers"],
                output_shape=self.pars["act_size"],
                seed=self.pars["seed"])
        self.opt["act"] = torch.optim.Adam(self.net["act_loc"].parameters(),
                                           lr=self.pars["lr_act"])

        for twin in self.twins:
            for netname in ["crit_loc", "crit_tg"]:
                self.net[netname + twin] = CriticNetwork(
                    input_shape=self.pars["state_size"],
                    lin_layers=self.pars["crit_layers"],
                    output_shape=(1, ),
                    action_layer=self.pars["act_input"],
                    action_shape=self.pars["act_size"],
                    seed=self.pars["seed"])
            self.opt["crit" + twin] = torch.optim.Adam(
                self.net["crit_loc" + twin].parameters(),
                lr=self.pars["lr_crit"])

        if self.pars["erep_eps"] < 1.0:
            self.mem = ExperienceReplayer(
                self.pars["erep_size"],
                wait_fill=self.pars["erep_fill"],
                default_prio=self.pars["erep_def_prio"],
                epsilon=self.pars["erep_eps"])
        else:
            self.mem = ExperienceReplayer(self.pars["erep_size"],
                                          wait_fill=self.pars["erep_fill"])

        self.train_cr_count = 0
Beispiel #30
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG' # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim)
        self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim)
        
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim])

        # Calculate y_batch
        
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch)
        y_batch = []  
        for i in range(len(minibatch)): 
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else :
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch,[BATCH_SIZE,1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self,state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action+self.exploration_noise.noise()

    def action(self,state):
        action = self.actor_network.action(state)
        return action

    def perceive(self,state,action,reward,next_state,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state,action,reward,next_state,done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() >  REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
            #self.actor_network.save_network(self.time_step)
            #self.critic_network.save_network(self.time_step)

        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Beispiel #31
0
    def main(self):
        np.random.seed(0)
        replay_memory = deque(maxlen=REPLAY_MEM_CAPACITY)

        def add_to_memory(experience):
            replay_memory.append(experience)

        def sample_from_memory(minibatch_size):
            return random.sample(replay_memory, minibatch_size)

        tf.reset_default_graph()

        # placeholders
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, STATE_DIM])
        action_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, ACTION_DIM])
        reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None])
        next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                                shape=[None, STATE_DIM])
        # indicators (go into target computation)
        is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                     shape=[None])
        is_training_placeholder = tf.placeholder(dtype=tf.bool,
                                                 shape=())  # for dropout

        # episode counter
        episodes = tf.Variable(0.0, trainable=False, name='episodes')
        episode_incr_op = episodes.assign_add(1)

        # actor network
        with tf.variable_scope('actor'):
            actor = Actor(STATE_DIM,
                          ACTION_DIM,
                          HIDDEN_1_ACTOR,
                          HIDDEN_2_ACTOR,
                          HIDDEN_3_ACTOR,
                          trainable=True)
            '''              
            Policy's outputted action for each state_ph (for generating 
            actions and training the critic)
            '''
            actions_unscaled = actor.call(state_placeholder)
            actions = MIN_BANDWIDTH + tf.nn.sigmoid(actions_unscaled) * (
                MAX_BANDWIDTH - MIN_BANDWIDTH)

        # slow target actor network
        with tf.variable_scope('target_actor', reuse=False):
            target_actor = Actor(STATE_DIM,
                                 ACTION_DIM,
                                 HIDDEN_1_ACTOR,
                                 HIDDEN_2_ACTOR,
                                 HIDDEN_3_ACTOR,
                                 trainable=True)
            '''
            Slow target policy's outputted action for each next_state_ph 
            (for training the critic)
            use stop_gradient to treat the output values as constant targets 
            when doing backprop
            '''
            target_next_actions_unscaled = target_actor.call(
                next_state_placeholder)
            target_next_actions_1 = MIN_BANDWIDTH + tf.nn.sigmoid(\
                target_next_actions_unscaled)*(MAX_BANDWIDTH - MIN_BANDWIDTH)
            target_next_actions = tf.stop_gradient(target_next_actions_1)

        with tf.variable_scope('critic') as scope:
            critic = Critic(STATE_DIM,
                            ACTION_DIM,
                            HIDDEN_1_CRITIC,
                            HIDDEN_2_CRITIC,
                            HIDDEN_3_CRITIC,
                            trainable=True)
            # Critic applied to state_ph and a given action(for training critic)
            q_values_of_given_actions = critic.call(state_placeholder,
                                                    action_placeholder)
            '''
            Critic applied to state_ph and the current policy's outputted 
            actions for state_ph (for training actor via deterministic 
            policy gradient)
            '''
            q_values_of_suggested_actions = critic.call(
                state_placeholder, actions)

        # slow target critic network
        with tf.variable_scope('target_critic', reuse=False):
            target_critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                   HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, \
                                       trainable=True)
            '''
            Slow target critic applied to slow target actor's outputted 
            actions for next_state_ph (for training critic)
            '''
            q_values_next = tf.stop_gradient(
                target_critic.call(next_state_placeholder,
                                   target_next_actions))

        # isolate vars for each network
        actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope='actor')
        target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='target_actor')
        critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope='critic')
        target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='target_critic')

        # update slowly-changing targets towards current actor and critic
        update_target_ops = []
        for i, target_actor_var in enumerate(target_actor_vars):
            update_target_actor_op = target_actor_var.assign(
                TAU * actor_vars[i] + (1 - TAU) * target_actor_var)
            update_target_ops.append(update_target_actor_op)

        for i, target_var in enumerate(target_critic_vars):
            target_critic_op = target_var.assign(TAU * critic_vars[i] +
                                                 (1 - TAU) * target_var)
            update_target_ops.append(target_critic_op)

        update_targets_op = tf.group(*update_target_ops,
                                     name='update_slow_targets')
        '''
        # One step TD targets y_i for (s,a) from experience replay
        # = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
        # = r_i if s' terminal
        '''
        targets = tf.expand_dims(
            reward_placeholder, 1) + tf.expand_dims(is_not_terminal_placeholder,\
                 1) * GAMMA * q_values_next

        # 1-step temporal difference errors
        td_errors = targets - q_values_of_given_actions

        # critic loss function (mean-square value error with regularization)
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        for var in critic_vars:
            if not 'bias' in var.name:
                critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

        # critic optimizer
        critic_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

        # actor loss function (mean Q-values under current policy with
        # regularization)
        actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
        for var in actor_vars:
            if not 'bias' in var.name:
                actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)
        '''
        actor optimizer
        the gradient of the mean Q-values wrt actor params is the 
        deterministic policy gradient (keeping critic params fixed)
        '''

        actor_train_op = tf.train.AdamOptimizer(
            LEARNING_RATE_ACTOR*LR_DECAY**episodes).minimize(actor_loss, \
                var_list=actor_vars)

        # initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # print(sess.run(tf.report_uninitialized_variables()))

        ## Training

        num_steps = 0
        for episode in range(NUM_EPISODES):
            total_reward = 0
            num_steps_in_episode = 0

            # Create noise
            noise = np.zeros(ACTION_DIM)
            noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
                (MAX_BANDWIDTH - MIN_BANDWIDTH)  # TODO: uses env

            # Initial state
            self.reset()  # TODO: uses env
            state = self.input_state

            for t in range(MAX_STEPS_PER_EPISODE):
                # choose action based on deterministic policy
                state = np.asarray(state)
                state = state.reshape(1, state.shape[0])
                action, = sess.run(actions,
                                   feed_dict={state_placeholder: state, \
                                       is_training_placeholder: False})

                # add temporally-correlated exploration noise to action
                # (using an Ornstein-Uhlenbeck process)
                noise = EXPLORATION_THETA * \
                    (EXPLORATION_MU - noise) + \
                    EXPLORATION_SIGMA*np.random.randn(ACTION_DIM)
                action += noise_scale * noise

                # take step
                next_state, reward, done, = self.step(action)
                total_reward += reward

                add_to_memory((
                    state,
                    action,
                    reward,
                    next_state,
                    #    is next_observation a terminal state?
                    #    0.0 if done and not env.env._past_limit() else
                    #    1.0))
                    0.0 if done else 1.0))

                # update network weights to fit a minibatch of experience
                if num_steps % TRAIN_EVERY == 0 and \
                    len(replay_memory) >= MINI_BATCH_SIZE:

                    minibatch = sample_from_memory(MINI_BATCH_SIZE)
                    '''
                    update the critic and actor params using mean-square value 
                    error and deterministic policy gradient, respectively
                    '''
                    _, _ = sess.run([critic_train_op, actor_train_op],
                                    feed_dict={
                        state_placeholder: np.asarray([elem[0] for elem in \
                            minibatch]),
                        action_placeholder: np.asarray([elem[1] for elem in \
                            minibatch]),
                        reward_placeholder: np.asarray([elem[2] for elem in \
                            minibatch]),
                        next_state_placeholder: np.asarray([elem[3] for elem in\
                             minibatch]),
                        is_not_terminal_placeholder: np.asarray([elem[4] for \
                            elem in minibatch]),

                        is_training_placeholder: True})
                    '''
                    update slow actor and critic targets towards current actor 
                    and critic
                    '''
                    _ = sess.run(update_targets_op)

                state = next_state
                num_steps += 1
                num_steps_in_episode += 1

                if done:
                    # Increment episode counter
                    _ = sess.run(episode_incr_op)
                    break

            print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: \
                %7.3f'                       % (episode, total_reward, num_steps_in_episode, \
                    noise_scale))
Beispiel #32
0
class DDPGAgent():
    """
    Deep deterministic policy gradient agent as described in
    https://arxiv.org/abs/1509.02971.

    This agent is meant to operate on low dimensional inputs, not raw pixels.

    To use the agent, you can get action predictions using act(), and to teach
    the agent, feed the results to learn.
    """
    def __init__(self, state_size, action_size, num_agents):
        """ Initialize agent.

        Params
        ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor
        self.local_actor_network = ActorNetwork(state_size, action_size)
        self.target_actor_network = ActorNetwork(state_size, action_size)
        self.actor_optimizer = optim.Adam(
            self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

        # Critic
        self.local_critic_network = CriticNetwork(state_size, action_size)
        self.target_critic_network = CriticNetwork(state_size, action_size)
        self.critic_optimizer = optim.Adam(
            self.local_critic_network.parameters(),
            lr=CRITIC_LEARNING_RATE,
            weight_decay=CRITIC_WEIGHT_DECAY)

        self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                          None)
        self.steps = 0
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.random_process = OrnsteinUhlenbeckProcess(
            (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)

    def act(self, states, noise=True):
        """
        Returns an action vector based on the current game state.

        Params
        ======
        states (array_like): A matrix of game states (each row represents the
            state of an agent)
        noise (boolean): Add random noise to the predicted action.  Aids
            exploration of the environment during training.
        """

        self.local_actor_network.eval()
        with torch.no_grad():
            actions = self.local_actor_network(
                torch.tensor(states, dtype=torch.float32)).detach().numpy()
        self.local_actor_network.train()
        if noise:
            actions = actions + self.random_process.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def vectorize_experiences(self, experiences):
        """Vectorizes experience objects for use by pytorch

        Params
        ======
            experiences (array_like of Experience objects): Experiences to
                vectorize
        """
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None
                       ]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def normalize(self, to_normalize):
        """
        Normalize the each row of the input along the 0 dimension using the
        formula (value - mean)/std

        Params
        ======
        to_normalize (array_like): Values to normalize
        """

        std = to_normalize.std(0)
        mean = to_normalize.mean(0)
        return (to_normalize - mean) / (std + 1e-5)

    def soft_update(self, target_parameters, local_parameters):
        """
        Updates the given target network parameters with the local parameters
        using a soft update strategy: tau * local + (1-tau) * target
        """

        for target, local in zip(target_parameters, local_parameters):
            target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data)

    def train(self, experiences):
        """
        Trains the actor and critic networks using a minibatch of experiences

        Params
        ======
        experiences (array_like of Experience): Minibatch of experiences
        """
        states, actions, rewards, next_states, dones = self.vectorize_experiences(
            experiences)
        #states = self.normalize(states)
        #next_states = self.normalize(next_states)
        rewards = self.normalize(rewards)

        # Use the target critic network to calculate a target q value
        next_actions = self.target_actor_network(next_states)
        q_target = rewards + GAMMA * self.target_critic_network(
            next_states, next_actions) * (1 - dones)

        # Calculate the predicted q value
        q_predicted = self.local_critic_network(states, actions)

        # Update critic network
        critic_loss = F.mse_loss(q_predicted, q_target)
        #print(critic_loss)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(),
                                       1)
        self.critic_optimizer.step()

        # Update predicted action using policy gradient
        actions_predicted = self.local_actor_network(states)
        #print(self.local_critic_network(states, actions_predicted).mean())
        policy_loss = -self.local_critic_network(states,
                                                 actions_predicted).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        #print(policy_loss)
        self.actor_optimizer.step()

        self.soft_update(self.target_actor_network.parameters(),
                         self.local_actor_network.parameters())
        self.soft_update(self.target_critic_network.parameters(),
                         self.local_critic_network.parameters())

    def learn(self, experience):
        """
        Tells the agent to learn from an experience.  This may not immediately
        result in training since this agent uses a replay buffer.

        Params
        ======
        experience (Experience): An experience used to teach the agent.
        """
        self.replay_buffer.add(experience)
        self.steps += 1
        if self.steps % STEPS_BETWEEN_TRAINING == 0 and len(
                self.replay_buffer) >= BATCH_SIZE:
            for i in range(ITERATIONS_PER_TRAINING):
                self.train(self.replay_buffer.sample(BATCH_SIZE))

    def save(self, filename):
        """Saves learned params of underlying networks to a checkpoint file.

        Params
        ======
            filename (string): Target file.  agent- and critic- are prepended
                for the agent and critic network, respectively
        """
        torch.save(self.local_actor_network.state_dict(), "actor-" + filename)
        torch.save(self.local_critic_network.state_dict(),
                   "critic-" + filename)

    def load(self, filename):
        """Loads learned params generated by save() into underlying networks.

            filename (string): Path to file.  There should be an agent- and
            critic- version of this file.
        """
        self.local_actor_network.load_state_dict(
            torch.load("actor-" + filename))
        self.target_actor_network.load_state_dict(
            torch.load("actor-" + filename))

        self.local_critic_network.load_state_dict(
            torch.load("critic-" + filename))
        self.target_critic_network.load_state_dict(
            torch.load("critic-" + filename))

    def end_episode(self):
        """
        Tell the agent that an episode is complete.
        """
        self.random_process.reset()
        self.steps = 0
Beispiel #33
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Beispiel #34
0
                       mask_inner=True,
                       mask_logits=True,
                       normalization=opts.normalization,
                       tanh_clipping=opts.tanh_clipping), opts.use_cuda)

    # Overwrite model parameters by parameters to load
    model.load_state_dict({**model.state_dict(), **load_data.get('model', {})})

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic':
        baseline = CriticBaseline(
            maybe_cuda_model(
                CriticNetwork(problem.NODE_DIM, opts.embedding_dim,
                              opts.hidden_dim, opts.n_encode_layers,
                              opts.normalization), opts.use_cuda))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
Beispiel #35
0
class DDPG:
    """docstring for DDPG"""
    def __init__(self, environment):
        self.name = 'DDPG' # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])

        # Initialize time step
        self.time_step = 0

    def set_init_observation(self,observation):
        # receive initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = random.sample(self.replay_buffer,BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        action_batch = np.resize(action_batch,[BATCH_SIZE,1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch)
        for i in range(0,BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch,state_batch,action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE

        self.actor_network.train(q_gradient_batch,state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high)

    def set_feedback(self,observation,action,reward,done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        next_state = observation
        self.replay_buffer.append((self.state,action,reward,next_state,done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Store transitions to replay start size then start training
        if self.time_step >  REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-iniitialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
Beispiel #36
0
class DDPG:

    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)  # sample BATCH_SIZE from replay_buffer
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, it's a number not a array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via target network
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)

        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print np.shape(reward_batch), np.shape(y_batch)

        # train actor network
        self.actor_network.train(state_batch)

        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target network
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print '--------------------------------'
        #     self.replay_buffer.save_to_pickle()
        # return
        
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print '===============reset noise========================='
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print 'Successfully loaded:', checkpoint.model_checkpoint_path
        else:
            print 'Could not find old network weights'
        return

    def save_network(self):
        print 'save actor-critic network...', self.time_step
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return