def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
             batch_size, n_train, n_episode):
    # Gym environment
    self.env = env
    env_flattened = gym.wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

    # Get space sizes
    self.state_dim = env_flattened.observation_space.shape[0]
    # self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]

    # Get the replay buffer and a function that samples a batch from it
    self.replay_buffer = replay_buffer
    self.sample_batch = sample_batch

    self.sess = tf.InteractiveSession()

    # Hyper parameters
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_train = n_train
    self.n_episode = n_episode

    # Initialize networks
    self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, sess, number, model_path, global_episodes, explore, decay, training): self.name = 'worker_' + str(number) # name for uploading results self.number = number # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 41 self.action_dim = 18 self.model_path = model_path self.global_episodes = global_episodes self.increment = self.global_episodes.assign_add(1) self.sess = sess self.explore = explore self.decay = decay self.training = training self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/actor') self.actor_network.update_target(self.sess) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.name + '/critic') self.critic_network.update_target(self.sess) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.update_local_ops_actor = update_target_graph( 'global/actor', self.name + '/actor') self.update_local_ops_critic = update_target_graph( 'global/critic', self.name + '/critic')
def __init__(self, env, DIRECTORY): self.batch_size = BATCH_SIZE self.replay_start_size = REPLAY_START_SIZE # self.sub_batch_size = BATCH_SIZE / n_gpu self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False)) self.trace_length = TRACE_LENGTH self.temp_abstract = TEMP_ABSTRACT self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) # initialize replay buffer max_len_trajectory = self.environment.spec.timestep_limit + 1 # trace_length self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY, max_len_trajectory, self.actor_network.last_epi) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) ### self.diff = 0. self.discounting_mat_dict = {}
def __init__(self, env_name, state_dim, action_dim): self.name = 'DDPG' # name for uploading results self.env_name = env_name # Randomly initialize actor network and critic network # with both their target networks self.state_dim = state_dim self.action_dim = action_dim # Ensure action bound is symmetric self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.OU = OU() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(save_location) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old network weights")
def __init__(self, input_dim, action_dim, critic_layers, actor_layers, actor_activation, scope='ac_network'): self.input_dim = input_dim self.action_dim = action_dim self.scope = scope self.x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='x') self.y = tf.placeholder(shape=(None, ), dtype=tf.float32, name='y') with tf.variable_scope(scope): self.actor_network = ActorNetwork(self.x, action_dim, hidden_layers=actor_layers, activation=actor_activation) self.critic_network = CriticNetwork( self.x, self.actor_network.get_output_layer(), hidden_layers=critic_layers) self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) self._build()
def __init__(self, state_size, action_size, num_agents): """ Initialize agent. Params ====== state_size (integer): Size of input state vector action_size (integer): Size of action vector num_agents (integer): Number of simultaneous agents in the environment """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents # Actor self.local_actor_network = ActorNetwork(state_size, action_size) self.target_actor_network = ActorNetwork(state_size, action_size) self.actor_optimizer = optim.Adam( self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE) # Critic self.local_critic_network = CriticNetwork(state_size, action_size) self.target_critic_network = CriticNetwork(state_size, action_size) self.critic_optimizer = optim.Adam( self.local_critic_network.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=CRITIC_WEIGHT_DECAY) self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None) self.steps = 0 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.random_process = OrnsteinUhlenbeckProcess( (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] # self.state_dim = env.observation_space.shape[0] * 2 self.action_dim = env.action_space.shape[0] self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) self.exploration_noise = OUNoise() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path)) else: my_config.logger.error("Could not find old network weights")
def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env self.epsilon_expert_range = (1.0, 0.1) self.epsilon_expert = self.epsilon_expert_range[0] self.epsilon_random_range = (0.1, 0.01) self.epsilon_random = self.epsilon_random_range[0] # Randomly initialize actor network and critic network # with both their target networks # self.state_dim = env.observation_space.shape[0] self.state_dim = 16 # self.action_dim = env.action_space.shape[0] self.action_dim = 3 self.time_step = 0 self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration # self.exploration_noise = OUNoise(self.action_dim) # self.exploration_noise = OUNoise() self.OU = OU() # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(MODEL_PATH) if checkpoint and checkpoint.model_checkpoint_path: path = checkpoint.model_checkpoint_path self.saver.restore(self.sess, path) self.time_step = int(path[path.rindex('-') + 1:]) self.epsilon_expert -= ( self.epsilon_expert_range[0] - self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1]) self.epsilon_random -= ( self.epsilon_random_range[0] - self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1]) logger.warn( "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s" % (path, self.time_step, self.epsilon_expert, self.epsilon_random)) else: logger.warn("Could not find old network weights") self.critic_cost = 0
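# The two epsilon schedules above (epsilon_expert, epsilon_random) anneal linearly from
# the start of their range to the end over EXPLORE_COUNT steps. A minimal standalone
# sketch of that schedule; the default values below are illustrative assumptions, not
# constants from the original code:
def linear_epsilon(step, start=1.0, end=0.1, explore_count=100000):
    """Linearly anneal epsilon from `start` to `end` over `explore_count` steps."""
    eps = start - (start - end) * step / explore_count
    return max(eps, end)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(50000) == 0.55, linear_epsilon(10**6) == 0.1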
def __init__(self, state_size, action_size, num_agents): """ Initialize agent. Params ====== state_size (integer): Size of input state vector action_size (integer): Size of action vector num_agents (integer): Number of simultaneous agents in the environment """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents # Actor self.actor = ActorNetwork(state_size, action_size) self.actor_target = ActorNetwork(state_size, action_size) self.soft_update(self.actor_target.parameters(), self.actor.parameters(), 1) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LEARNING_RATE) # Create one critic per agent self.critics = [] self.critic_targets = [] self.critic_optimizers = [] for i in range(num_agents): # Critic # Note: we use action_size * num_agents since we'll pass in the actions of all agents concatenated critic = CriticNetwork(state_size * num_agents, action_size * num_agents) self.critics.append(critic) self.critic_targets.append( CriticNetwork(state_size * num_agents, action_size * num_agents)) self.soft_update(self.critic_targets[-1].parameters(), critic.parameters(), 1) self.critic_optimizers.append( optim.Adam(critic.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=CRITIC_WEIGHT_DECAY)) self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None) self.steps = 0 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.random_process = OrnsteinUhlenbeckProcess((1, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
def __init__(self, track_name='practgt2.xml'):
    BUFFER_SIZE = 100000
    TAU = 0.001      # Target network hyperparameter
    LRA = 0.0001     # Learning rate for actor
    LRC = 0.001      # Learning rate for critic
    state_dim = 29   # number of sensor inputs
    self.batch_size = 32
    self.lambda_mix = 10.0
    self.action_dim = 3  # Steering/Acceleration/Brake

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                              self.batch_size, TAU, LRA)
    self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                self.batch_size, TAU, LRC)
    self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    self.track_name = track_name

    self.save = dict(total_reward=[], total_step=[], ave_reward=[],
                     distRaced=[], distFromStart=[], lastLapTime=[],
                     curLapTime=[], lapTimes=[], avelapTime=[], ave_sp=[],
                     max_sp=[], min_sp=[],
                     test_total_reward=[], test_total_step=[],
                     test_ave_reward=[], test_distRaced=[],
                     test_distFromStart=[], test_lastLapTime=[],
                     test_curLapTime=[], test_lapTimes=[],
                     test_avelapTime=[], test_ave_sp=[],
                     test_max_sp=[], test_min_sp=[])
def __init__(self, env): self.sess = tf.InteractiveSession() #self.params = loadparams() # ??? self.env = env self.n_states = env.observation_space.shape[0] self.n_actions = env.action_space.shape[0] self.low = self.env.action_space.low self.high = self.env.action_space.high self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions) self.trainable_var_count = self.actor_network.get_trainable_var_count() self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \ self.actor_network, self.trainable_var_count) self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']??? self.exploration_noise = OUNoise(self.n_actions) # self.noise = Noise() self.gamma = GAMMA self.sess.run(tf.global_variables_initializer())
def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
    agents = []
    nets = None
    for ii in range(num_agents):
        agent_name = 'agent' + str(ii)
        agents.append(
            ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
        nets = agents[-1].nets
    return agents
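# Hedged illustration of the shared-`nets` pattern used by create_multi_agents above:
# the first agent is built with nets=None and every later agent receives the previous
# agent's `nets`, so all agents end up pointing at the same underlying network objects.
# StubActor is a self-contained stand-in for illustration only, not the real ActorNetwork.
class StubActor:
    def __init__(self, name, nets=None):
        self.name = name
        self.nets = nets if nets is not None else {'shared_layer': object()}

agents = []
nets = None
for ii in range(3):
    agents.append(StubActor('agent' + str(ii), nets))
    nets = agents[-1].nets

assert agents[0].nets is agents[2].nets  # every stub shares the same nets dict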
def __init__(self, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, state_dim, action_dim):
    self.name = 'DDPG'
    self.environment = env
    self.time_step = 0
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
    self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) self.critic_network = CriticNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0
def __init__(self): self.name = 'DDPG' # name for uploading results # self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = 12 self.action_dim = 10 self.has_kicked = False self.laststep_haskicked = False self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) self.saver = tf.train.Saver(max_to_keep=1) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return
def __init__(self, env_name, sess, state_dim, action_dim, models_dir, img_dim): self.name = 'DDPG' self.env_name = env_name self.state_dim = state_dim self.action_dim = action_dim self.img_dim = img_dim self.models_dir = models_dir # Ensure action bound is symmetric self.time_step = 0 self.sess = sess self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.saver = tf.train.Saver()
def __init__(self):
    self._init_setup()
    self.viewer = None
    self.action_space = spaces.Box(self.act_low, self.act_high)
    self.observation_space = spaces.Box(self.obs_low, self.obs_high)
    self._seed()
    self._reset()
    self.dt = 0.01
    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess,
                                      self.observation_space.shape[0],
                                      self.action_space.shape[0])
    self.goal_state = np.zeros(shape=3)
def __init__(self, env, results_file): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) results_file.write(ActorNetwork.get_settings())
def __init__(self, env): # ------------------- init the (NN) & (Buf) & (explor noise) & (counter) ------------------- self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env[0] self.action_dim = env[1] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.epsilon_max = 1.0 self.epsilon_min = 0.01 self.counter = 0
def __init__(self, env, loadfilename=None, printVars=False):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    # print('init complete')

    self.all_vars = tf.global_variables()
    if printVars:
        for v in self.all_vars:
            print(v.name.ljust(30), v.shape)
    self.saver = tf.train.Saver(self.all_vars)
    if loadfilename is not None:
        self.saver.restore(self.sess, loadfilename)
def add_agents(self, add_num):
    for ii in range(add_num):
        # self.num_agents += 1
        agent_name = 'agent' + str(self.num_agents)
        self.agents.append(
            ActorNetwork(self.sess, self.state_dim, self.action_dim,
                         agent_name, self.agents[-1].nets))
        # the agents are named 'agent0' ... 'agent{num_agents-1}'
        self.num_agents += 1

    # if a new agent is added, reset the noise and the replay buffer
    self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
    # self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.replay_buffer.erase()

    # Re-create the saver: the new saver will contain all savable variables;
    # otherwise it would only cover the initially created agents.
    self.saver = tf.train.Saver()
def main(self): np.random.seed(0) replay_memory = deque(maxlen=REPLAY_MEM_CAPACITY) def add_to_memory(experience): replay_memory.append(experience) def sample_from_memory(minibatch_size): return random.sample(replay_memory, minibatch_size) tf.reset_default_graph() # placeholders state_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, STATE_DIM]) action_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, ACTION_DIM]) reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None]) next_state_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, STATE_DIM]) # indicators (go into target computation) is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32, shape=[None]) is_training_placeholder = tf.placeholder(dtype=tf.bool, shape=()) # for dropout # episode counter episodes = tf.Variable(0.0, trainable=False, name='episodes') episode_incr_op = episodes.assign_add(1) # actor network with tf.variable_scope('actor'): actor = Actor(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True) ''' Policy's outputted action for each state_ph (for generating actions and training the critic) ''' actions_unscaled = actor.call(state_placeholder) actions = MIN_BANDWIDTH + tf.nn.sigmoid(actions_unscaled) * ( MAX_BANDWIDTH - MIN_BANDWIDTH) # slow target actor network with tf.variable_scope('target_actor', reuse=False): target_actor = Actor(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True) ''' Slow target policy's outputted action for each next_state_ph (for training the critic) use stop_gradient to treat the output values as constant targets when doing backprop ''' target_next_actions_unscaled = target_actor.call( next_state_placeholder) target_next_actions_1 = MIN_BANDWIDTH + tf.nn.sigmoid(\ target_next_actions_unscaled)*(MAX_BANDWIDTH - MIN_BANDWIDTH) target_next_actions = tf.stop_gradient(target_next_actions_1) with tf.variable_scope('critic') as scope: critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, trainable=True) # Critic applied to state_ph and a given action(for training critic) q_values_of_given_actions = critic.call(state_placeholder, action_placeholder) ''' Critic applied to state_ph and the current policy's outputted actions for state_ph (for training actor via deterministic policy gradient) ''' q_values_of_suggested_actions = critic.call( state_placeholder, actions) # slow target critic network with tf.variable_scope('target_critic', reuse=False): target_critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, \ trainable=True) ''' Slow target critic applied to slow target actor's outputted actions for next_state_ph (for training critic) ''' q_values_next = tf.stop_gradient( target_critic.call(next_state_placeholder, target_next_actions)) # isolate vars for each network actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor') critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic') target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic') # update slowly-changing targets towards current actor and critic update_target_ops = [] for i, target_actor_var in enumerate(target_actor_vars): update_target_actor_op = target_actor_var.assign( TAU * actor_vars[i] + (1 - TAU) * target_actor_var) update_target_ops.append(update_target_actor_op) for i, target_var in 
enumerate(target_critic_vars): target_critic_op = target_var.assign(TAU * critic_vars[i] + (1 - TAU) * target_var) update_target_ops.append(target_critic_op) update_targets_op = tf.group(*update_target_ops, name='update_slow_targets') ''' # One step TD targets y_i for (s,a) from experience replay # = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal # = r_i if s' terminal ''' targets = tf.expand_dims( reward_placeholder, 1) + tf.expand_dims(is_not_terminal_placeholder,\ 1) * GAMMA * q_values_next # 1-step temporal difference errors td_errors = targets - q_values_of_given_actions # critic loss function (mean-square value error with regularization) critic_loss = tf.reduce_mean(tf.square(td_errors)) for var in critic_vars: if not 'bias' in var.name: critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var) # critic optimizer critic_train_op = tf.train.AdamOptimizer( LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss) # actor loss function (mean Q-values under current policy with # regularization) actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions) for var in actor_vars: if not 'bias' in var.name: actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var) ''' actor optimizer the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed) ''' actor_train_op = tf.train.AdamOptimizer( LEARNING_RATE_ACTOR*LR_DECAY**episodes).minimize(actor_loss, \ var_list=actor_vars) # initialize session sess = tf.Session() sess.run(tf.global_variables_initializer()) # print(sess.run(tf.report_uninitialized_variables())) ## Training num_steps = 0 for episode in range(NUM_EPISODES): total_reward = 0 num_steps_in_episode = 0 # Create noise noise = np.zeros(ACTION_DIM) noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \ (MAX_BANDWIDTH - MIN_BANDWIDTH) # TODO: uses env # Initial state self.reset() # TODO: uses env state = self.input_state for t in range(MAX_STEPS_PER_EPISODE): # choose action based on deterministic policy state = np.asarray(state) state = state.reshape(1, state.shape[0]) action, = sess.run(actions, feed_dict={state_placeholder: state, \ is_training_placeholder: False}) # add temporally-correlated exploration noise to action # (using an Ornstein-Uhlenbeck process) noise = EXPLORATION_THETA * \ (EXPLORATION_MU - noise) + \ EXPLORATION_SIGMA*np.random.randn(ACTION_DIM) action += noise_scale * noise # take step next_state, reward, done, = self.step(action) total_reward += reward add_to_memory(( state, action, reward, next_state, # is next_observation a terminal state? 
# 0.0 if done and not env.env._past_limit() else # 1.0)) 0.0 if done else 1.0)) # update network weights to fit a minibatch of experience if num_steps % TRAIN_EVERY == 0 and \ len(replay_memory) >= MINI_BATCH_SIZE: minibatch = sample_from_memory(MINI_BATCH_SIZE) ''' update the critic and actor params using mean-square value error and deterministic policy gradient, respectively ''' _, _ = sess.run([critic_train_op, actor_train_op], feed_dict={ state_placeholder: np.asarray([elem[0] for elem in \ minibatch]), action_placeholder: np.asarray([elem[1] for elem in \ minibatch]), reward_placeholder: np.asarray([elem[2] for elem in \ minibatch]), next_state_placeholder: np.asarray([elem[3] for elem in\ minibatch]), is_not_terminal_placeholder: np.asarray([elem[4] for \ elem in minibatch]), is_training_placeholder: True}) ''' update slow actor and critic targets towards current actor and critic ''' _ = sess.run(update_targets_op) state = next_state num_steps += 1 num_steps_in_episode += 1 if done: # Increment episode counter _ = sess.run(episode_incr_op) break print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: \ %7.3f' % (episode, total_reward, num_steps_in_episode, \ noise_scale))
class DDPGAgent(): """ Deep deterministic policy gradient agent as described in https://arxiv.org/abs/1509.02971. This agent is meant to operate on low dimensional inputs, not raw pixels. To use the agent, you can get action predictions using act(), and to teach the agent, feed the results to learn. """ def __init__(self, state_size, action_size, num_agents): """ Initialize agent. Params ====== state_size (integer): Size of input state vector action_size (integer): Size of action vector num_agents (integer): Number of simultaneous agents in the environment """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents # Actor self.local_actor_network = ActorNetwork(state_size, action_size) self.target_actor_network = ActorNetwork(state_size, action_size) self.actor_optimizer = optim.Adam( self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE) # Critic self.local_critic_network = CriticNetwork(state_size, action_size) self.target_critic_network = CriticNetwork(state_size, action_size) self.critic_optimizer = optim.Adam( self.local_critic_network.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=CRITIC_WEIGHT_DECAY) self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None) self.steps = 0 self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.random_process = OrnsteinUhlenbeckProcess( (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA) def act(self, states, noise=True): """ Returns an action vector based on the current game state. Params ====== states (array_like): A matrix of game states (each row represents the state of an agent) noise (boolean): Add random noise to the predicted action. Aids exploration of the environment during training. """ self.local_actor_network.eval() with torch.no_grad(): actions = self.local_actor_network( torch.tensor(states, dtype=torch.float32)).detach().numpy() self.local_actor_network.train() if noise: actions = actions + self.random_process.sample() actions = np.clip(actions, -1, 1) return actions def vectorize_experiences(self, experiences): """Vectorizes experience objects for use by pytorch Params ====== experiences (array_like of Experience objects): Experiences to vectorize """ states = torch.from_numpy( np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) actions = torch.from_numpy( np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) rewards = torch.from_numpy( np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) next_states = torch.from_numpy( np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) dones = torch.from_numpy( np.vstack([e.done for e in experiences if e is not None ]).astype(np.uint8)).float().to(self.device) return (states, actions, rewards, next_states, dones) def normalize(self, to_normalize): """ Normalize the each row of the input along the 0 dimension using the formula (value - mean)/std Params ====== to_normalize (array_like): Values to normalize """ std = to_normalize.std(0) mean = to_normalize.mean(0) return (to_normalize - mean) / (std + 1e-5) def soft_update(self, target_parameters, local_parameters): """ Updates the given target network parameters with the local parameters using a soft update strategy: tau * local + (1-tau) * target """ for target, local in zip(target_parameters, local_parameters): target.data.copy_(TAU * local.data + (1.0 - TAU) * target.data) def train(self, experiences): """ Trains 
the actor and critic networks using a minibatch of experiences Params ====== experiences (array_like of Experience): Minibatch of experiences """ states, actions, rewards, next_states, dones = self.vectorize_experiences( experiences) #states = self.normalize(states) #next_states = self.normalize(next_states) rewards = self.normalize(rewards) # Use the target critic network to calculate a target q value next_actions = self.target_actor_network(next_states) q_target = rewards + GAMMA * self.target_critic_network( next_states, next_actions) * (1 - dones) # Calculate the predicted q value q_predicted = self.local_critic_network(states, actions) # Update critic network critic_loss = F.mse_loss(q_predicted, q_target) #print(critic_loss) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.local_critic_network.parameters(), 1) self.critic_optimizer.step() # Update predicted action using policy gradient actions_predicted = self.local_actor_network(states) #print(self.local_critic_network(states, actions_predicted).mean()) policy_loss = -self.local_critic_network(states, actions_predicted).mean() self.actor_optimizer.zero_grad() policy_loss.backward() #print(policy_loss) self.actor_optimizer.step() self.soft_update(self.target_actor_network.parameters(), self.local_actor_network.parameters()) self.soft_update(self.target_critic_network.parameters(), self.local_critic_network.parameters()) def learn(self, experience): """ Tells the agent to learn from an experience. This may not immediately result in training since this agent uses a replay buffer. Params ====== experience (Experience): An experience used to teach the agent. """ self.replay_buffer.add(experience) self.steps += 1 if self.steps % STEPS_BETWEEN_TRAINING == 0 and len( self.replay_buffer) >= BATCH_SIZE: for i in range(ITERATIONS_PER_TRAINING): self.train(self.replay_buffer.sample(BATCH_SIZE)) def save(self, filename): """Saves learned params of underlying networks to a checkpoint file. Params ====== filename (string): Target file. agent- and critic- are prepended for the agent and critic network, respectively """ torch.save(self.local_actor_network.state_dict(), "actor-" + filename) torch.save(self.local_critic_network.state_dict(), "critic-" + filename) def load(self, filename): """Loads learned params generated by save() into underlying networks. filename (string): Path to file. There should be an agent- and critic- version of this file. """ self.local_actor_network.load_state_dict( torch.load("actor-" + filename)) self.target_actor_network.load_state_dict( torch.load("actor-" + filename)) self.local_critic_network.load_state_dict( torch.load("critic-" + filename)) self.target_critic_network.load_state_dict( torch.load("critic-" + filename)) def end_episode(self): """ Tell the agent that an episode is complete. """ self.random_process.reset() self.steps = 0
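# Hedged usage sketch for DDPGAgent above, following the act()/learn() flow described
# in its docstring. The Unity-style multi-agent `env` interface, the Experience
# namedtuple and the control flow below are illustrative assumptions, not part of the
# original code; the field names match what vectorize_experiences() reads.
from collections import namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

def run_episode(agent, env):
    states = env.reset()                                  # assumed: one state row per agent
    while True:
        actions = agent.act(states)                       # noisy actions for exploration
        next_states, rewards, dones = env.step(actions)   # assumed env API
        for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
            agent.learn(Experience(s, a, r, ns, d))       # buffered; trains periodically
        states = next_states
        if any(dones):
            agent.end_episode()                           # resets the OU noise process
            break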
class Agent: def __init__(self, input_dims, n_actions, env, fc1_dims, fc2_dims, alpha, beta, gamma, tau, noise1, noise2, clamp, delay, max_size, batch_size, warmup): self.gamma = gamma self.tau = tau self.noise1 = noise1 self.noise2 = noise2 self.clamp = clamp self.delay = delay self.batch_size = batch_size self.warmup = warmup self.learn_cntr = 0 self.env = env self.n_actions = n_actions self.actor = ActorNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha, name='Actor_TD3PG.cpt', checkpoint_dir='tmp/models') self.critic_1 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Critic_1_TD3PG.cpt', checkpoint_dir='tmp/models') self.critic_2 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Critic_2_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_actor = ActorNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha, name='Target_Actor_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_critic_1 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Target_Critic_1_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_critic_2 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Target_Critic_2_TD3PG.cpt', checkpoint_dir='tmp/models') self.memory = ReplayBuffer( max_size=max_size, input_shape=input_dims, n_actions=n_actions) self.update_target_networks() def update_target_networks(self): tau = self.tau actor = dict(self.actor.named_parameters()) critic_1 = dict(self.critic_1.named_parameters()) critic_2 = dict(self.critic_2.named_parameters()) target_actor = dict(self.target_actor.named_parameters()) target_critic_1 = dict(self.target_critic_1.named_parameters()) target_critic_2 = dict(self.target_critic_2.named_parameters()) for name in actor: actor[name] = tau*actor[name].clone() + (1-tau)*target_actor[name].clone() for name in critic_1: critic_1[name] = tau*critic_1[name].clone() + (1-tau)*target_critic_1[name].clone() for name in critic_2: critic_2[name] = tau*critic_2[name].clone() + (1-tau)*target_critic_2[name].clone() self.target_actor.load_state_dict(actor) self.target_critic_1.load_state_dict(critic_1) self.target_critic_2.load_state_dict(critic_2) def choose_action(self, observation): if self.learn_cntr < self.warmup: mu = np.random.normal(scale=self.noise1, size=self.n_actions) mu = T.tensor(mu).to(self.actor.device) else: state = T.tensor(observation, dtype=T.float).to(self.actor.device) mu = self.actor.forward(state) noise = T.tensor(np.random.normal(scale=self.noise1, size=self.n_actions), dtype=T.float).to(self.actor.device) mu_ = T.clamp(T.add(mu, noise), min=self.env.action_space.low[0], max=self.env.action_space.high[0]) self.learn_cntr += 1 return mu_.cpu().detach().numpy() def save_models(self): self.actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() self.target_actor.save_checkpoint() self.target_critic_1.save_checkpoint() self.target_critic_2.save_checkpoint() def load_models(self): self.actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() self.target_actor.load_checkpoint() self.target_critic_1.load_checkpoint() self.target_critic_2.load_checkpoint() def remember(self, state, action, reward, state_, done): self.memory.store_transition(state, 
action, reward, state_, done) def sample(self): states, actions, rewards, states_, done = \ self.memory.sample_buffer(self.batch_size) states = T.tensor(states, dtype=T.float).to(self.critic_1.device) actions = T.tensor(actions, dtype=T.float).to(self.critic_1.device) rewards = T.tensor(rewards, dtype=T.float).to(self.critic_1.device) states_ = T.tensor(states_, dtype=T.float).to(self.critic_1.device) done = T.tensor(done, dtype=T.int).to(self.critic_1.device) return states, actions, rewards, states_, done def learn(self): if self.memory.mem_cntr < self.batch_size: return states, actions, rewards, states_, done = self.sample() Vs1 = self.critic_1.forward(states, actions) Vs2 = self.critic_2.forward(states, actions) actions_ = self.target_actor.forward(states_) noise = T.tensor(np.random.normal(scale=self.noise1, size=self.n_actions), dtype=T.float).to(self.actor.device) noise = T.clamp(noise, min=-self.clamp, max=self.clamp) actions_ = T.add(actions_, noise) actions_ = T.clamp(actions_, min=self.env.action_space.low[0], max=self.env.action_space.high[0]) critic_1_Vs_ = self.target_critic_1.forward(states_, actions_) critic_2_Vs_ = self.target_critic_2.forward(states_, actions_) min_Vs_ = T.min(critic_1_Vs_, critic_2_Vs_) target = rewards + self.gamma*min_Vs_*(1-done) self.critic_1.optim.zero_grad() self.critic_2.optim.zero_grad() critic_1_loss = F.mse_loss(Vs1, target) critic_2_loss = F.mse_loss(Vs2, target) critic_loss = T.add(critic_1_loss, critic_2_loss) critic_loss.backward() self.critic_1.optim.step() self.critic_2.optim.step() if self.learn_cntr % self.delay == 0: self.actor.optim.zero_grad() actor_loss = self.critic_1.forward(states_, self.actor.forward(states_)) actor_loss = -T.mean(actor_loss) actor_loss.backward() self.actor.optim.step() self.update_target_networks()
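# Hedged training-loop sketch for the TD3-style Agent above, wiring together
# choose_action(), remember() and learn(). The gym environment name, episode count and
# the hyperparameters passed to Agent() are illustrative assumptions only.
import gym

env = gym.make('LunarLanderContinuous-v2')            # assumed environment
agent = Agent(input_dims=env.observation_space.shape, n_actions=env.action_space.shape[0],
              env=env, fc1_dims=400, fc2_dims=300, alpha=1e-3, beta=1e-3,
              gamma=0.99, tau=0.005, noise1=0.1, noise2=0.2, clamp=0.5,
              delay=2, max_size=1000000, batch_size=100, warmup=1000)

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        state_, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, state_, done)
        agent.learn()                                  # trains once the buffer holds a batch
        state = state_
agent.save_models()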
class DDPG: """docstring for DDPG""" def __init__(self, env, results_file): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) results_file.write(ActorNetwork.get_settings()) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
def __init__(self, input_dims, n_actions, env, fc1_dims, fc2_dims, alpha, beta, gamma, tau, noise1, noise2, clamp, delay, max_size, batch_size, warmup): self.gamma = gamma self.tau = tau self.noise1 = noise1 self.noise2 = noise2 self.clamp = clamp self.delay = delay self.batch_size = batch_size self.warmup = warmup self.learn_cntr = 0 self.env = env self.n_actions = n_actions self.actor = ActorNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha, name='Actor_TD3PG.cpt', checkpoint_dir='tmp/models') self.critic_1 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Critic_1_TD3PG.cpt', checkpoint_dir='tmp/models') self.critic_2 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Critic_2_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_actor = ActorNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha, name='Target_Actor_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_critic_1 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Target_Critic_1_TD3PG.cpt', checkpoint_dir='tmp/models') self.target_critic_2 = CriticNetwork( input_shape=input_dims, n_actions=n_actions, fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta, name='Target_Critic_2_TD3PG.cpt', checkpoint_dir='tmp/models') self.memory = ReplayBuffer( max_size=max_size, input_shape=input_dims, n_actions=n_actions) self.update_target_networks()
class RDPG: """docstring for RDPG""" def __init__(self, env): self.name = 'RDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.saver = tf.train.Saver() def train(self): # Sample a random minibatch of N sequences from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # Construct histories observations = [] next_observations = [] actions = [] rewards = [] dones = [] for each in minibatch: for i in range(1, len(each.observations)): observations.append(self.pad(each.observations[0:i])) next_observations.append(self.pad(each.observations[1, i + 1])) actions.append(each.actions[0:i - 1]) rewards.append(each.rewards[0:i]) if i == len(each.observations) - 1: dones.append(True) else: dones.append(False) # Calculate y_batch next_action_batch = self.actor_network.target_action(observations) q_value_batch = self.critic_network.target_q( next_observations, [self.pad(i + j) for (i, j) in zip(actions, next_action_batch)]) y_batch = [] for i in range(len(observations)): if dones[i]: y_batch.append(rewards[i][-1]) else: y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [len(observations), 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions]) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(observations) q_gradient_batch = self.critic_network.gradients( observations, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, observations) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def save_model(self, path, episode): self.saver.save(self.sess, path + "modle.ckpt", episode) def noise_action(self, history): # Select action a_t according to a sequence of observation and action action = self.actor_network.action(history) return action + self.exploration_noise.noise() def action(self, history): action = self.actor_network.action(history) return action def perceive(self, history): # Store the history sequence in the replay buffer self.replay_buffer.add(history) # Store history to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def pad(self, input): dim = len(input[0]) return input + [[0] * dim] * (1000 - len(input))
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self,observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch,[BATCH_SIZE,1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch) for i in range(0,BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high) def set_feedback(self,observation,action,reward,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append((self.state,action,reward,next_state,done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) self.critic_network = CriticNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self, observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer, BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch, [BATCH_SIZE, 1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate( next_state_batch) q_value_batch = self.critic_network.target_evaluate( next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) / BATCH_SIZE self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action + self.exploration_noise.noise(), self.environment.action_space.low, self.environment.action_space.high) def set_feedback(self, observation, action, reward, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append( (self.state, action, reward, next_state, done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
def main():
    '''
    Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    '''
    Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()

    '''
    Create placeholders
    '''
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                 shape=[None],
                                                 name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')

    '''
    A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    '''
    Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM,
                             HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)

    '''
    Scale the actions to fit within the bounds provided by the environment
    '''
    actions = scale_actions(unscaled_actions,
                            env.action_space.low,
                            env.action_space.high)

    '''
    Create the target actor network inside the target_actor scope and calculate
    the target actions. Apply stop_gradient to the target actions so that their
    gradient is not computed at any point of time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM,
                                    HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                                    trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)

    '''
    Scale the actions to fit within the bounds provided by the environment
    '''
    target_actions_temp = scale_actions(unscaled_target_actions,
                                        env.action_space.low,
                                        env.action_space.high)
    target_actions = tf.stop_gradient(target_actions_temp)

    '''
    Create the critic network inside the critic variable scope. Get the
    Q-values of the given actions and the Q-values of the actions suggested by
    the actor network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM,
                               HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder, action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)

    '''
    Create the target critic network inside the target_critic variable scope.
    Calculate the target Q-values and apply stop_gradient to them.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM,
                                      HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                                      trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder, target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    '''
    Collect
    - trainable variables in actor (weights of the actor network),
    - weights of the target actor network,
    - trainable variables in critic (weights of the critic network),
    - weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')

    '''
    Get the operators for updating the target networks. The
    update_target_networks function defined in utils returns a list of
    operators to be run from the tf session in order to update the target
    networks using a soft update.
    '''
    update_targets_op = update_target_networks(TAU,
                                               target_actor_vars, actor_vars,
                                               target_critic_vars, critic_vars)

    '''
    Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared error of the TD-errors)
    - add L2 regularization on the critic weights
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic weights (biases are excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY ** episodes).minimize(critic_loss)

    '''
    Create a tf operation to train the actor network:
    - calculate the actor network's loss
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY ** episodes).minimize(actor_loss,
                                                             var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add Ornstein-Uhlenbeck noise to the action
            # (accumulate into the running noise state)
            noise += EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]
            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op], feed_dict={
                    state_placeholder: np.asarray([elem[0] for elem in batch]),
                    action_placeholder: np.asarray([elem[1] for elem in batch]),
                    reward_placeholder: np.asarray([elem[2] for elem in batch]),
                    next_state_placeholder: np.asarray([elem[3] for elem in batch]),
                    is_not_terminal_placeholder: np.asarray([elem[4] for elem in batch]),
                    is_training_placeholder: True})
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
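# main() above calls two helpers, scale_actions and update_target_networks,
# that live in the original project's utils module and are not reproduced in
# this section. The sketch below is only an illustration of what they could
# look like, inferred from how they are called above: it assumes the actor's
# output layer is tanh-bounded and that the actor/critic variable collections
# are ordered consistently with their target counterparts.
import tensorflow as tf


def scale_actions(unscaled_actions, low, high):
    # Map actions from the tanh range [-1, 1] to the environment's [low, high]
    return low + (unscaled_actions + 1.0) * 0.5 * (high - low)


def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # Build the soft-update ops: target <- tau * online + (1 - tau) * target.
    # Pairing by zip assumes matching variable order between each network and
    # its target copy.
    update_ops = []
    for target_var, var in zip(target_actor_vars + target_critic_vars,
                               actor_vars + critic_vars):
        update_ops.append(target_var.assign(tau * var + (1.0 - tau) * target_var))
    return update_ops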
class DDPG:
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()

        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.global_variables_initializer())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.summary.scalar('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.summary.scalar('living_time', self.time_input)
        self.summary_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.global_variables())
        self.load_time_step()
        self.load_network()
        return

    def train(self):
        action_dim = self.action_dim

        # sample BATCH_SIZE transitions from replay_buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        # if action_dim == 1, each action is a number, not an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via the target networks
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print(np.shape(reward_batch), np.shape(y_batch))

        # train actor network
        self.actor_network.train(state_batch)
        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        return

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()
        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print('--------------------------------')
        #     self.replay_buffer.save_to_pickle()
        #     return
        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print('===============reset noise=========================')
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1
        return

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1
        return

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded:', checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')
        return

    def save_network(self):
        print('save actor-critic network...', self.time_step)
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
        return
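# The class above expects a ReplayBuffer with add(), get_batch() and size()
# methods, defined elsewhere in the original repository. The deque-based
# sketch below only mirrors the interface used above; it is an illustrative
# stand-in, not the original implementation.
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        # oldest transitions are dropped automatically once maxlen is reached
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # Store one transition tuple (s_t, a_t, r_t, s_{t+1}, done)
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # Sample a random minibatch of transitions
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)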