def __init__(self, state_size, action_size, num_agents):
    """Initialize agent.

    Params
    ======
        state_size (int): Size of the input state vector
        action_size (int): Size of the action vector
        num_agents (int): Number of simultaneous agents in the environment
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents

    # Actor
    self.local_actor_network = ActorNetwork(state_size, action_size)
    self.target_actor_network = ActorNetwork(state_size, action_size)
    self.actor_optimizer = optim.Adam(
        self.local_actor_network.parameters(), lr=ACTOR_LEARNING_RATE)

    # Critic
    self.local_critic_network = CriticNetwork(state_size, action_size)
    self.target_critic_network = CriticNetwork(state_size, action_size)
    self.critic_optimizer = optim.Adam(
        self.local_critic_network.parameters(),
        lr=CRITIC_LEARNING_RATE,
        weight_decay=CRITIC_WEIGHT_DECAY)

    self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None)
    self.steps = 0
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.random_process = OrnsteinUhlenbeckProcess(
        (num_agents, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
def __init__(self, state_size, action_size, num_agents):
    """Initialize agent.

    Params
    ======
        state_size (int): Size of the input state vector
        action_size (int): Size of the action vector
        num_agents (int): Number of simultaneous agents in the environment
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents

    # Actor (shared by all agents)
    self.actor = ActorNetwork(state_size, action_size)
    self.actor_target = ActorNetwork(state_size, action_size)
    self.soft_update(self.actor_target.parameters(), self.actor.parameters(), 1)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LEARNING_RATE)

    # Create one critic per agent
    self.critics = []
    self.critic_targets = []
    self.critic_optimizers = []
    for i in range(num_agents):
        # Each critic sees the states and actions of all agents concatenated,
        # hence state_size * num_agents and action_size * num_agents.
        critic = CriticNetwork(state_size * num_agents, action_size * num_agents)
        self.critics.append(critic)
        self.critic_targets.append(
            CriticNetwork(state_size * num_agents, action_size * num_agents))
        self.soft_update(self.critic_targets[-1].parameters(), critic.parameters(), 1)
        self.critic_optimizers.append(
            optim.Adam(critic.parameters(),
                       lr=CRITIC_LEARNING_RATE,
                       weight_decay=CRITIC_WEIGHT_DECAY))

    self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None)
    self.steps = 0
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.random_process = OrnsteinUhlenbeckProcess(
        (1, action_size), sigma=RANDOM_SIGMA, theta=RANDOM_THETA)
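# Note: the constructor above calls a soft_update helper that is not shown in this
# snippet. Below is a minimal sketch of what such a helper typically looks like in
# PyTorch; the signature (target parameters, source parameters, interpolation factor
# tau) is an assumption inferred from the call sites above, not taken from the source.
def soft_update(self, target_params, source_params, tau):
    """Polyak-average the source parameters into the target parameters.

    With tau=1 this is a hard copy, which is how it is used in __init__ to
    synchronise the freshly created target networks with the local networks.
    """
    for target_param, source_param in zip(target_params, source_params):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)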
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = env.observation_space.shape[0]
    # self.state_dim = env.observation_space.shape[0] * 2
    self.action_dim = env.action_space.shape[0]
    self.time_step = 0

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    self.exploration_noise = OUNoise()

    # Load previously saved networks, if any
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        my_config.logger.warn("Successfully loaded: %s" % checkpoint.model_checkpoint_path)
    else:
        my_config.logger.error("Could not find old network weights")
def __init__(self, sess, number, model_path, global_episodes, explore, decay, training):
    self.name = 'worker_' + str(number)  # name for uploading results
    self.number = number
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = 41
    self.action_dim = 18
    self.model_path = model_path
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.sess = sess
    self.explore = explore
    self.decay = decay
    self.training = training

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim,
                                      self.name + '/actor')
    self.actor_network.update_target(self.sess)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim,
                                        self.name + '/critic')
    self.critic_network.update_target(self.sess)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.update_local_ops_actor = update_target_graph('global/actor',
                                                      self.name + '/actor')
    self.update_local_ops_critic = update_target_graph('global/critic',
                                                       self.name + '/critic')
def __init__(self, input_dim, action_dim, critic_layers, actor_layers,
             actor_activation, scope='ac_network'):
    self.input_dim = input_dim
    self.action_dim = action_dim
    self.scope = scope

    self.x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='x')
    self.y = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')

    with tf.variable_scope(scope):
        self.actor_network = ActorNetwork(self.x, action_dim,
                                          hidden_layers=actor_layers,
                                          activation=actor_activation)
        self.critic_network = CriticNetwork(self.x,
                                            self.actor_network.get_output_layer(),
                                            hidden_layers=critic_layers)
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)
    self._build()
def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
             batch_size, n_train, n_episode):
    # Gym environment
    self.env = env
    env_flattened = gym.wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

    # Get space sizes
    self.state_dim = env_flattened.observation_space.shape[0]
    # self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]

    # Replay buffer and a function that samples a batch from it
    self.replay_buffer = replay_buffer
    self.sample_batch = sample_batch

    self.sess = tf.InteractiveSession()

    # Hyperparameters
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_train = n_train
    self.n_episode = n_episode

    # Initialize networks
    self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, DIRECTORY):
    self.batch_size = BATCH_SIZE
    self.replay_start_size = REPLAY_START_SIZE
    # self.sub_batch_size = BATCH_SIZE / n_gpu
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    self.trace_length = TRACE_LENGTH
    self.temp_abstract = TEMP_ABSTRACT
    self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                      self.action_dim, self.temp_abstract, DIRECTORY)
    self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                        self.action_dim, self.temp_abstract, DIRECTORY)

    # Initialize the replay buffer
    max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                      max_len_trajectory, self.actor_network.last_epi)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.diff = 0.
    self.discounting_mat_dict = {}
def __init__(self, env_name, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    self.env_name = env_name
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim  # Ensure the action bound is symmetric
    self.time_step = 0

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.OU = OU()

    # Load previously saved networks, if any
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(save_location)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
def create_multi_agents(self, sess, num_agents, state_dim, action_dim):
    agents = []
    nets = None
    for ii in range(num_agents):
        agent_name = 'agent' + str(ii)
        agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
        nets = agents[-1].nets
    return agents
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    self.epsilon_expert_range = (1.0, 0.1)
    self.epsilon_expert = self.epsilon_expert_range[0]
    self.epsilon_random_range = (0.1, 0.01)
    self.epsilon_random = self.epsilon_random_range[0]
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    # self.state_dim = env.observation_space.shape[0]
    self.state_dim = 16
    # self.action_dim = env.action_space.shape[0]
    self.action_dim = 3
    self.time_step = 0

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    # self.exploration_noise = OUNoise()
    self.OU = OU()

    # Load previously saved networks, if any
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        path = checkpoint.model_checkpoint_path
        self.saver.restore(self.sess, path)
        self.time_step = int(path[path.rindex('-') + 1:])
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
        logger.warn(
            "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
            % (path, self.time_step, self.epsilon_expert, self.epsilon_random))
    else:
        logger.warn("Could not find old network weights")

    self.critic_cost = 0
def __init__(self):
    self._init_setup()
    self.viewer = None
    self.action_space = spaces.Box(self.act_low, self.act_high)
    self.observation_space = spaces.Box(self.obs_low, self.obs_high)
    self._seed()
    self._reset()
    self.dt = 0.01

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess,
                                      self.observation_space.shape[0],
                                      self.action_space.shape[0])
    self.goal_state = np.zeros(shape=3)
def __init__(self, track_name='practgt2.xml'):
    BUFFER_SIZE = 100000
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for the actor
    LRC = 0.001     # Learning rate for the critic
    state_dim = 29  # Number of sensor inputs
    self.batch_size = 32
    self.lambda_mix = 10.0
    self.action_dim = 3  # Steering / Acceleration / Brake

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRA)
    self.critic = CriticNetwork(sess, state_dim, self.action_dim, self.batch_size, TAU, LRC)
    self.buff = ReplayBuffer(BUFFER_SIZE)  # Create the replay buffer
    self.track_name = track_name

    self.save = dict(total_reward=[], total_step=[], ave_reward=[], distRaced=[],
                     distFromStart=[], lastLapTime=[], curLapTime=[], lapTimes=[],
                     avelapTime=[], ave_sp=[], max_sp=[], min_sp=[],
                     test_total_reward=[], test_total_step=[], test_ave_reward=[],
                     test_distRaced=[], test_distFromStart=[], test_lastLapTime=[],
                     test_curLapTime=[], test_lapTimes=[], test_avelapTime=[],
                     test_ave_sp=[], test_max_sp=[], test_min_sp=[])
def __init__(self, env):
    self.sess = tf.InteractiveSession()
    # self.params = loadparams()  # ???
    self.env = env
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.low = self.env.action_space.low
    self.high = self.env.action_space.high

    self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
    self.trainable_var_count = self.actor_network.get_trainable_var_count()
    self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions,
                                        self.actor_network, self.trainable_var_count)

    self.replay_buffer = ReplayBuffer(BUFFER_SIZE)  # params['buffer_size']???
    self.exploration_noise = OUNoise(self.n_actions)
    # self.noise = Noise()
    self.gamma = GAMMA
    self.sess.run(tf.global_variables_initializer())
def __init__(self, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
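# Note: most constructors in this collection rely on an OUNoise helper that is not
# shown. The sketch below is a minimal Ornstein-Uhlenbeck noise process; the default
# parameters (mu=0.0, theta=0.15, sigma=0.2) and the method names reset()/noise()
# are assumptions for illustration, not taken from any of the snippets above.
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous-action agents."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start every episode from the mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) dt + sigma * dW, with dt = 1
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state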
def add_agents(self, add_num):
    for ii in range(add_num):
        # self.num_agents += 1
        agent_name = 'agent' + str(self.num_agents)
        self.agents.append(
            ActorNetwork(self.sess, self.state_dim, self.action_dim, agent_name,
                         self.agents[-1].nets))
        # the agents are named from 0 to num_agents - 1
        self.num_agents += 1

    # If a new agent is added, reset the noise and the replay buffer
    self.exploration_noise = OUNoise((self.num_agents, self.action_dim))
    # self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.replay_buffer.erase()

    # Re-create the saver: the new saver will contain all savable variables;
    # otherwise it would only contain the initially created agents.
    self.saver = tf.train.Saver()
def __init__(self, environment):
    self.name = 'DDPG'  # name for uploading results
    self.environment = environment
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.actor_network = ActorNetwork(
        state_size=environment.observation_space.shape[0],
        action_size=environment.action_space.shape[0])
    self.critic_network = CriticNetwork(
        state_size=environment.observation_space.shape[0],
        action_size=environment.action_space.shape[0])

    # Initialize the replay buffer
    self.replay_buffer = deque()

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(environment.action_space.shape[0])

    # Initialize the time step
    self.time_step = 0
def __init__(self, env, state_dim, action_dim):
    self.name = 'DDPG'
    self.environment = env
    self.time_step = 0
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize Ornstein-Uhlenbeck processes for action exploration
    self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
    self.angular_noise = OUNoise(1, 0, 0.6, 0.8)
def __init__(self):
    self.name = 'DDPG'  # name for uploading results
    # self.environment = env
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = 12
    self.action_dim = 10
    self.has_kicked = False
    self.laststep_haskicked = False

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.saver = tf.train.Saver(max_to_keep=1)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, results_file):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    results_file.write(ActorNetwork.get_settings())
def __init__(self, env_name, sess, state_dim, action_dim, models_dir, img_dim):
    self.name = 'DDPG'
    self.env_name = env_name
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.img_dim = img_dim
    self.models_dir = models_dir
    # Ensure action bound is symmetric
    self.time_step = 0
    self.sess = sess

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim,
                                      self.img_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim,
                                        self.img_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.saver = tf.train.Saver()
def __init__(self, env, loadfilename=None, printVars=False):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    # print('init complete')

    self.all_vars = tf.global_variables()
    if printVars:
        for v in self.all_vars:
            print(v.name.ljust(30), v.shape)

    self.saver = tf.train.Saver(self.all_vars)
    if loadfilename is not None:
        self.saver.restore(self.sess, loadfilename)
def __init__(self, env):
    # ----- Initialize the networks, replay buffer, exploration noise and counter -----
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize the actor and critic networks,
    # together with their target networks
    self.state_dim = env[0]
    self.action_dim = env[1]

    self.sess = tf.InteractiveSession()
    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # Initialize the replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize an Ornstein-Uhlenbeck process for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.epsilon_max = 1.0
    self.epsilon_min = 0.01
    self.counter = 0
lr = 0.0001  # learning rate
ENV_NAME = 'Pendulum-v0'

if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    env.seed(1)
    env = env.unwrapped

    # Get state and action dimensions
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Initialize actor, critic and target networks
    actor = ActorNetwork(action_dim=action_dim)
    critic = CriticNetwork()
    target_mu = TargetNetMu(actor)
    target_q = TargetNetQ(critic)

    # Initialize the replay buffer
    memory = Memory(capacity=buffer_size, dims=2 * state_dim + action_dim + 1)

    # Total loss for the critic
    total_critic_loss = 0
    total_transition_trained_on = 0

    # Outer iteration
    for m in range(M):
        # Receive the initial observation
def main():
    ''' Create the environment '''
    env = gym.make(ENV_NAME)

    # For TensorBoard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    ''' Create the replay memory '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # The TensorFlow part starts here!
    tf.reset_default_graph()

    ''' Create placeholders '''
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                 shape=[None],
                                                 name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')

    ''' A counter to count the number of episodes '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    '''
    Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        unscaled_actions = actor.call(state_placeholder)

        ''' Scale the actions to fit within the bounds provided by the environment '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)

    '''
    Create the target actor network inside the target_actor scope and calculate
    the target actions. Apply stop_gradient to the target actions so that their
    gradient is not computed at any point of time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)

        ''' Scale the actions to fit within the bounds provided by the environment '''
        # Note: the original passed env.action_space.low twice here; the upper bound
        # should be env.action_space.high, as in the actor scope above.
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)

    '''
    Create the critic network inside the critic variable scope. Get the Q-values
    of the given actions and the Q-values of the actions suggested by the actor
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder, action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)

    '''
    Create the target critic network inside the target_critic variable scope.
    Calculate the target Q-values and apply stop_gradient to them.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder, target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    '''
    Collect
    - the trainable variables of the actor (weights of the actor network),
    - the weights of the target actor network,
    - the trainable variables of the critic (weights of the critic network),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

    '''
    Get the operators for updating the target networks. The update_target_networks
    function defined in utils returns a list of operators to be run from the tf
    session in order to update the target networks using a soft update.
    '''
    update_targets_op = update_target_networks(TAU, target_actor_vars, actor_vars,
                                               target_critic_vars, critic_vars)

    '''
    Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared error of the TD-errors,
      plus L2 regularization of the non-bias weights)
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization for the critic's non-bias weights
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize the critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY ** episodes).minimize(critic_loss)

    '''
    Create a tf operation to train the actor network:
    - calculate the actor network's loss
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize the actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY ** episodes).minimize(actor_loss,
                                                             var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add exploration noise to the action (Ornstein-Uhlenbeck style update)
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take the action in the environment
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]
            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run(
                    [critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray([elem[0] for elem in batch]),
                        action_placeholder: np.asarray([elem[1] for elem in batch]),
                        reward_placeholder: np.asarray([elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray([elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray([elem[4] for elem in batch]),
                        is_training_placeholder: True
                    })
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
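# Note: the script above depends on an update_target_networks utility (defined in
# utils) that is not shown here. The sketch below is one way such a soft-update
# helper could look, under assumptions implied by the call site: TF1-style variable
# lists and an interpolation factor tau. The zip over concatenated lists assumes the
# source and target collections are ordered consistently.
import tensorflow as tf

def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    """Return ops that move each target variable a fraction tau towards its source."""
    ops = []
    for target_var, source_var in zip(target_actor_vars + target_critic_vars,
                                      actor_vars + critic_vars):
        ops.append(target_var.assign(tau * source_var + (1.0 - tau) * target_var))
    return ops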
def __init__(self, input_dims, n_actions, env, fc1_dims, fc2_dims, alpha, beta,
             gamma, tau, noise1, noise2, clamp, delay, max_size, batch_size, warmup):
    self.gamma = gamma
    self.tau = tau
    self.noise1 = noise1
    self.noise2 = noise2
    self.clamp = clamp
    self.delay = delay
    self.batch_size = batch_size
    self.warmup = warmup
    self.learn_cntr = 0
    self.env = env
    self.n_actions = n_actions

    self.actor = ActorNetwork(input_shape=input_dims, n_actions=n_actions,
                              fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha,
                              name='Actor_TD3PG.cpt', checkpoint_dir='tmp/models')
    self.critic_1 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                  fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta,
                                  name='Critic_1_TD3PG.cpt', checkpoint_dir='tmp/models')
    self.critic_2 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                  fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta,
                                  name='Critic_2_TD3PG.cpt', checkpoint_dir='tmp/models')

    self.target_actor = ActorNetwork(input_shape=input_dims, n_actions=n_actions,
                                     fc1_dims=fc1_dims, fc2_dims=fc2_dims, alpha=alpha,
                                     name='Target_Actor_TD3PG.cpt',
                                     checkpoint_dir='tmp/models')
    self.target_critic_1 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                         fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta,
                                         name='Target_Critic_1_TD3PG.cpt',
                                         checkpoint_dir='tmp/models')
    self.target_critic_2 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                         fc1_dims=fc1_dims, fc2_dims=fc2_dims, beta=beta,
                                         name='Target_Critic_2_TD3PG.cpt',
                                         checkpoint_dir='tmp/models')

    self.memory = ReplayBuffer(max_size=max_size, input_shape=input_dims,
                               n_actions=n_actions)
    self.update_target_networks()
import tensorflow as tf
import numpy as np

from criticnetwork import CriticNetwork
from actor_network import ActorNetwork

state_dim = 2
action_dim = 3
batch_size = 4
GAMMA = .9
num_agents = 5

nets = None
agents = []
sess = tf.InteractiveSession()
for ii in range(num_agents):
    agent_name = 'agent' + str(ii)
    agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name, nets))
    nets = agents[-1].nets

critic = CriticNetwork(sess, state_dim, action_dim)

# Take an action with each agent
current_states = np.random.rand(1, num_agents, state_dim)
current_action = np.zeros((1, num_agents, action_dim))
for ii in range(num_agents):
    current_action[0, ii, :] = agents[ii].actions(
        np.reshape(current_states[0, ii, :], [-1, state_dim]))

Rt = np.random.rand(1, num_agents)
next_state = np.random.rand(1, num_agents, state_dim)
next_action = np.zeros((1, num_agents, action_dim))
for ii in range(num_agents):
def run_ddpg(amodel, cmodel, train_indicator=0, seeded=1337, track_name='practgt2.xml'):
    OU = FunctionOU()
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for the actor
    LRC = 0.001     # Learning rate for the critic
    ALPHA = 0.9

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # Number of sensor inputs

    np.random.seed(seeded)

    vision = False

    EXPLORE = 100000.
    if train_indicator:
        episode_count = 600
    else:
        episode_count = 3
    max_steps = 20000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create the replay buffer

    # Generate a TORCS environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False,
                   track_name=track_name)

    if not train_indicator:
        # Now load the weights
        # logging.info("Now we load the weights")
        print("Now we load the weights")
        try:
            actor.model.load_weights(amodel)
            critic.model.load_weights(cmodel)
            actor.target_model.load_weights(amodel)
            critic.target_model.load_weights(cmodel)
            # logging.info("Weights loaded successfully")
            print("Weights loaded successfully")
        except:
            # logging.info("Cannot find the weights")
            print("Cannot find the weights")
            exit()

    # logging.info("TORCS Experiment Start.")
    print("TORCS Experiment Start.")
    best_lap = 500

    for i_episode in range(episode_count):
        print("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))
        # logging.info("Episode : " + str(i_episode) + " Replay Buffer " + str(buff.count()))

        if np.mod(i_episode, 3) == 0:
            # Relaunch TORCS every 3 episodes because of the memory leak error
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        s_t = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                         ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

        total_reward = 0.

        for j_iter in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.speedX, ob.angle, ob.trackPos, ob.speedY, ob.speedZ,
                              ob.rpm, ob.wheelSpinVel / 100.0, ob.track))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add to the replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            # y_t only serves as a container of the right shape; it is overwritten below
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i_episode, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            if np.mod(step, 1000) == 0:
                logging.info("Episode {}, Distance {}, Last Lap {}".format(
                    i_episode, ob.distRaced, ob.lastLapTime))
                if ob.lastLapTime > 0:
                    if best_lap < ob.lastLapTime:
                        best_lap = ob.lastLapTime

            step += 1
            if done:
                break

        if train_indicator and i_episode > 20:
            if np.mod(i_episode, 3) == 0:
                logging.info("Now we save model")
                actor.model.save_weights("ddpg_actor_weights_periodic.h5", overwrite=True)
                critic.model.save_weights("ddpg_critic_weights_periodic.h5", overwrite=True)

        print("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("Best Lap {}".format(best_lap))
        print("")
        logging.info("TOTAL REWARD @ " + str(i_episode) + "-th Episode : Reward " +
                     str(total_reward))
        logging.info("Best Lap {}".format(best_lap))

    env.end()  # This is for shutting down TORCS
    logging.info("Finish.")
def fit(self, *args, **kwargs):
    MEM_SZ = MEM_SIZE_FCL

    sess = K.get_session()
    K.set_learning_phase(1)

    self.actor = ActorNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE,
                              TAU, LRA, convolutional=CONVOLUTIONAL,
                              output_activation=ACTION_ACTIVATION)
    self.critic = CriticNetwork(sess, self.state_dim, self.nn_action_dim, BATCH_SIZE,
                                TAU, LRC, convolutional=CONVOLUTIONAL)

    self.memory = Memory(MEM_SZ)

    self.actor.target_model.summary()
    self.critic.target_model.summary()

    if LOAD_WEIGHTS:
        self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_model_" +
                                      LOAD_WEIGHTS_EPISODE + ".h5")
        self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_model_" +
                                       LOAD_WEIGHTS_EPISODE + ".h5")
        self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "actor_target_model_" +
                                             LOAD_WEIGHTS_EPISODE + ".h5")
        self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX + "critic_target_model_" +
                                              LOAD_WEIGHTS_EPISODE + ".h5")
        print("Weights Loaded!")

    # ====================================================
    # Initialize noise processes
    # self.noise_procs = []
    # for i in range(NUM_NOISE_PROCS):
    #     self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV))
    # ====================================================

    PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
    steps = STARTING_EPISODE * EPISODE_LENGTH
    start_time = time.time()
    last_ep_time = time.time()

    if MAKE_PLOT:
        reward_graph = Grapher()

    for ep in range(STARTING_EPISODE, EPISODES):
        # Reset noise processes
        # for ou in self.noise_procs:
        #     ou.reset()
        self.noise.reset()

        # Start the time counter
        if ep == PRE_LEARNING_EPISODES:
            start_time = time.time()

        print("Episode: " + str(ep) + " Frames: " + str(ep * EPISODE_LENGTH) +
              " Uptime: " + str((time.time() - start_time) / 3600.0) + " hrs ===========")

        state = self.env.reset()

        play_only = (ep % 10 == 0)
        total_reward = 0

        if play_only or ALREADY_TRAINED:
            for step in range(TEST_EPISODE_LENGTH):
                # print(">>>>>>>>>>>>>", state.shape)
                # img = np.array([np.subtract(img, 128)], dtype=np.float32)  # zero center
                # img = np.multiply(img, 1.0 / 128.0)  # scale [-1, 1]
                # img = np.transpose(state, (1, 2, 0))
                # img = np.array(state)
                # img = np.transpose(img, (1, 2, 0))
                state = np.reshape(state, state.shape + (1,))

                action, control_action = self.selectAction(state, can_be_random=False,
                                                           use_target=True)
                nstate, reward, done, info = self.env.step(control_action)
                total_reward += reward
                state = nstate
        else:
            for step in range(EPISODE_LENGTH):
                # ACT ==============================
                epsilon = (float(steps) / float(EPSILON_STEPS)) * \
                    (EPSILON_RANGE[1] - EPSILON_RANGE[0]) + EPSILON_RANGE[0]

                state = np.reshape(state, state.shape + (1,))
                action, control_action = self.selectAction(state, epsilon=epsilon)
                new_state, reward, done, info = self.env.step(control_action)
                done = done or (step >= EPISODE_LENGTH)
                self.memory.addMemory(state, action, reward, new_state, done)
                state = new_state

                # LEARN ============================
                if ep > PRE_LEARNING_EPISODES:
                    batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                    self.learnFromBatch(batch)

                if done:
                    break

                # CLEANUP ==========================
                steps += 1

        # We need to consider the episodes without noise to actually tell
        # how the system is doing.
        if play_only and MAKE_PLOT:
            reward_graph.addSample(total_reward)
            reward_graph.displayPlot()

        # Calculate fph on the total number of frames
        total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
        elapsed = time.time() - start_time
        fps = total_frames / elapsed
        fph = fps * 3600.0

        # Re-calculate fps on this episode, so it updates quickly
        fps = EPISODE_LENGTH / (time.time() - last_ep_time)
        last_ep_time = time.time()
        print("fps: " + str(fps) + " fph: " + str(fph) + "\n")

        # Save the plot and the weights
        if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY == 0) and not ALREADY_TRAINED:
            # Plot
            if MAKE_PLOT:
                reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" + str(ep) + ".jpg")

            # Weights
            self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX + "actor_model_" +
                                          str(ep) + ".h5", overwrite=True)
            self.actor.target_model.save_weights(SAVE_WEIGHTS_PREFIX + "actor_target_model_" +
                                                 str(ep) + ".h5", overwrite=True)
            self.critic.model.save_weights(SAVE_WEIGHTS_PREFIX + "critic_model_" +
                                           str(ep) + ".h5", overwrite=True)
            self.critic.target_model.save_weights(SAVE_WEIGHTS_PREFIX + "critic_target_model_" +
                                                  str(ep) + ".h5", overwrite=True)

            # Network structures (although I don't think I ever actually use these)
            with open(SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) + ".json",
                      "w") as outfile:
                json.dump(self.actor.model.to_json(), outfile)
            with open(SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) + ".json",
                      "w") as outfile:
                json.dump(self.actor.target_model.to_json(), outfile)
            with open(SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".json",
                      "w") as outfile:
                json.dump(self.critic.model.to_json(), outfile)
            with open(SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) + ".json",
                      "w") as outfile:
                json.dump(self.critic.target_model.to_json(), outfile)
state_batch = np.random.rand(batch_size, state_dim)

# with tf.Session() as sess:
#     actor = ActorNetwork(sess, state_dim, action_dim, agent_name, 1)
#     print(actor.actions(state_batch))
#     actor.update_target()
#     print('\n')
#     print(actor.target_actions(state_batch))
#
#     actor.train(y_grad, state_batch)
#     actor.update_target()
#     print(actor.target_actions(state_batch))

# Test creating multiple agents
# agents = []
# with tf.Session() as sess:
#     for ii in range(10):
#         agent_name = 'agent' + str(ii)
#         print(agent_name)
#         agents.append(ActorNetwork(sess, state_dim, action_dim, agent_name))
#     print(agents)

# Test that the copy works
with tf.Session() as sess:
    agent1 = ActorNetwork(sess, state_dim, action_dim, 'agent1')
    agent1.train(y_grad, state_batch)
    agent2 = ActorNetwork(sess, state_dim, action_dim, 'agent2', agent1.nets)
    print('agent 1', agent1.actions(state_batch))
    print('agent 2', agent2.actions(state_batch))