def __init__(self, state_size, action_size, num_agents):
    """
    Initialize agent.

    Params
    ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents

    # Actor
    self.local_actor_network = ActorNetwork(state_size, action_size)
    self.target_actor_network = ActorNetwork(state_size, action_size)
    self.actor_optimizer = optim.Adam(self.local_actor_network.parameters(),
                                      lr=ACTOR_LEARNING_RATE)

    # Critic
    self.local_critic_network = CriticNetwork(state_size, action_size)
    self.target_critic_network = CriticNetwork(state_size, action_size)
    self.critic_optimizer = optim.Adam(self.local_critic_network.parameters(),
                                       lr=CRITIC_LEARNING_RATE,
                                       weight_decay=CRITIC_WEIGHT_DECAY)

    self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None)
    self.steps = 0
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.random_process = OrnsteinUhlenbeckProcess((num_agents, action_size),
                                                   sigma=RANDOM_SIGMA,
                                                   theta=RANDOM_THETA)
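# --- Hedged sketch (not from the original source): a minimal
# OrnsteinUhlenbeckProcess compatible with the call above, which passes a
# sample shape plus sigma/theta. The mu and dt defaults are assumptions.
import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated noise for exploration in continuous action spaces."""

    def __init__(self, shape, sigma=0.2, theta=0.15, mu=0.0, dt=1e-2):
        self.shape, self.sigma, self.theta, self.mu, self.dt = shape, sigma, theta, mu, dt
        self.reset()

    def reset(self):
        self.state = np.ones(self.shape) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.state) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.shape))
        self.state = self.state + dx
        return self.state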
def __init__(self, state_size, action_size, num_agents):
    """
    Initialize agent.

    Params
    ======
        state_size (integer): Size of input state vector
        action_size (integer): Size of action vector
        num_agents (integer): Number of simultaneous agents in the environment
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents

    # Actor (shared across agents)
    self.actor = ActorNetwork(state_size, action_size)
    self.actor_target = ActorNetwork(state_size, action_size)
    # tau=1 performs a hard copy of the local weights into the target network
    self.soft_update(self.actor_target.parameters(), self.actor.parameters(), 1)
    self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                      lr=ACTOR_LEARNING_RATE)

    # Create one critic per agent
    self.critics = []
    self.critic_targets = []
    self.critic_optimizers = []
    for i in range(num_agents):
        # Critic
        # Note: we use state_size * num_agents and action_size * num_agents
        # since we'll pass in the observations and actions of all agents
        # concatenated.
        critic = CriticNetwork(state_size * num_agents,
                               action_size * num_agents)
        self.critics.append(critic)
        self.critic_targets.append(
            CriticNetwork(state_size * num_agents, action_size * num_agents))
        self.soft_update(self.critic_targets[-1].parameters(),
                         critic.parameters(), 1)
        self.critic_optimizers.append(
            optim.Adam(critic.parameters(),
                       lr=CRITIC_LEARNING_RATE,
                       weight_decay=CRITIC_WEIGHT_DECAY))

    self.replay_buffer = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE, None)
    self.steps = 0
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.random_process = OrnsteinUhlenbeckProcess((1, action_size),
                                                   sigma=RANDOM_SIGMA,
                                                   theta=RANDOM_THETA)
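# --- Hedged sketch (assumption; soft_update is called above but not shown):
# the usual DDPG Polyak-averaging helper. With tau=1 it reduces to a hard copy
# into the target network, which is how the constructor above uses it.
def soft_update(self, target_params, source_params, tau):
    """target = tau * source + (1 - tau) * target, parameter-wise."""
    for target_param, source_param in zip(target_params, source_params):
        target_param.data.copy_(tau * source_param.data
                                + (1.0 - tau) * target_param.data)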
def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
             batch_size, n_train, n_episode):
    # Gym environment
    self.env = env
    env_flattened = gym.wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

    # Get space sizes
    self.state_dim = env_flattened.observation_space.shape[0]
    # self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = self.env.action_space.shape[0]

    # Get the replay buffer and a function to sample a batch from it
    self.replay_buffer = replay_buffer
    self.sample_batch = sample_batch

    self.sess = tf.InteractiveSession()

    # Hyperparameters
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_train = n_train
    self.n_episode = n_episode

    # Initialize networks
    self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, input_dim, action_dim, critic_layers, actor_layers,
             actor_activation, scope='ac_network'):
    self.input_dim = input_dim
    self.action_dim = action_dim
    self.scope = scope

    self.x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='x')
    self.y = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')

    with tf.variable_scope(scope):
        self.actor_network = ActorNetwork(self.x, action_dim,
                                          hidden_layers=actor_layers,
                                          activation=actor_activation)
        self.critic_network = CriticNetwork(
            self.x, self.actor_network.get_output_layer(),
            hidden_layers=critic_layers)
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    self._build()
def __init__(self, sess, number, model_path, global_episodes, explore,
             decay, training):
    self.name = 'worker_' + str(number)  # name for uploading results
    self.number = number
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = 41
    self.action_dim = 18
    self.model_path = model_path
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.sess = sess
    self.explore = explore
    self.decay = decay
    self.training = training

    self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                      self.action_dim, self.name + '/actor')
    self.actor_network.update_target(self.sess)
    self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                        self.action_dim, self.name + '/critic')
    self.critic_network.update_target(self.sess)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.update_local_ops_actor = update_target_graph('global/actor',
                                                      self.name + '/actor')
    self.update_local_ops_critic = update_target_graph('global/critic',
                                                       self.name + '/critic')
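# --- Hedged sketch (assumption; update_target_graph is referenced above but
# not shown): the usual A3C-style helper that returns assign ops copying the
# trainable variables of one scope into another, run once per sync.
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]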
def __init__(self, env_name, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    self.env_name = env_name
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim
    # Ensure action bound is symmetric
    self.time_step = 0
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.OU = OU()

    # loading networks
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(save_location)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    # self.state_dim = env.observation_space.shape[0] * 2
    self.action_dim = env.action_space.shape[0]
    self.time_step = 0
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    self.exploration_noise = OUNoise()

    # loading networks
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
        my_config.logger.warn("Successfully loaded: %s"
                              % checkpoint.model_checkpoint_path)
    else:
        my_config.logger.error("Could not find old network weights")
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    # self.state_dim = env.observation_space.shape[0]
    # self.action_dim = env.action_space.shape[0]
    self.state_dim = env.state_size
    self.action_dim = env.action_size
    self.action_bound = (env.action_high - env.action_low) / 2
    print('state_dim: ', self.state_dim, 'action_dim: ', self.action_dim,
          'action_bound: ', self.action_bound)

    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                      self.action_dim, self.action_bound)
    self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                        self.action_dim, self.action_bound)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env, DIRECTORY):
    self.batch_size = BATCH_SIZE
    self.replay_start_size = REPLAY_START_SIZE
    # self.sub_batch_size = BATCH_SIZE / n_gpu

    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    self.trace_length = TRACE_LENGTH
    self.temp_abstract = TEMP_ABSTRACT
    self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                      self.action_dim, self.temp_abstract,
                                      DIRECTORY)
    self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim,
                                        self.action_dim, self.temp_abstract,
                                        DIRECTORY)

    # initialize replay buffer
    max_len_trajectory = self.environment.spec.timestep_limit + 1  # trace_length
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                      max_len_trajectory,
                                      self.actor_network.last_epi)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    ###
    self.diff = 0.
    self.discounting_mat_dict = {}
def __init__(self, env):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    self.epsilon_expert_range = (1.0, 0.1)
    self.epsilon_expert = self.epsilon_expert_range[0]
    self.epsilon_random_range = (0.1, 0.01)
    self.epsilon_random = self.epsilon_random_range[0]
    # Randomly initialize actor network and critic network
    # with both their target networks
    # self.state_dim = env.observation_space.shape[0]
    self.state_dim = 16
    # self.action_dim = env.action_space.shape[0]
    self.action_dim = 3
    self.time_step = 0
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    # self.exploration_noise = OUNoise(self.action_dim)
    # self.exploration_noise = OUNoise()
    self.OU = OU()

    # loading networks
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
    if checkpoint and checkpoint.model_checkpoint_path:
        path = checkpoint.model_checkpoint_path
        self.saver.restore(self.sess, path)
        # Recover the training step from the checkpoint filename and anneal
        # both epsilons accordingly.
        self.time_step = int(path[path.rindex('-') + 1:])
        self.epsilon_expert -= (
            (self.epsilon_expert_range[0] - self.epsilon_expert_range[1])
            * self.time_step / EXPLORE_COUNT)
        self.epsilon_expert = max(self.epsilon_expert,
                                  self.epsilon_expert_range[1])
        self.epsilon_random -= (
            (self.epsilon_random_range[0] - self.epsilon_random_range[1])
            * self.time_step / EXPLORE_COUNT)
        self.epsilon_random = max(self.epsilon_random,
                                  self.epsilon_random_range[1])
        logger.warn(
            "Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s"
            % (path, self.time_step, self.epsilon_expert, self.epsilon_random))
    else:
        logger.warn("Could not find old network weights")

    self.critic_cost = 0
def __init__(self, env, state_dim, action_dim):
    self.name = 'DDPG'
    self.environment = env
    self.time_step = 0
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
def __init__(self, sess, data_fname):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = Hp.state_dim
    self.action_dim = Hp.action_dim
    print(self.state_dim, self.action_dim)
    self.sess = sess

    self.state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in range(Hp.categories)
    ]  # tf.placeholder("float", [None, self.state_dim])
    self.target_state_input = [
        tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord))
        for _ in range(Hp.categories)
    ]  # tf.placeholder("float", [None, self.state_dim])

    self.state_network = StateEnc(self.sess, self.state_input,
                                  self.target_state_input)
    state_batch = self.state_network.encoding
    next_state_batch = self.state_network.target_encoding

    (weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1,
     w_i2h2, w_h2h2, w_b2) = self.state_network.get_parameters()
    state_network_params = weights + biases + [
        w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2
    ]

    self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim,
                                      self.state_input, state_batch,
                                      next_state_batch, state_network_params)
    self.critic_network = CriticNetwork(self.sess, Hp.n_hidden,
                                        self.action_dim, state_batch,
                                        next_state_batch)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname)
    self.summary_str2 = None

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, track_name='practgt2.xml'):
    BUFFER_SIZE = 100000
    TAU = 0.001     # Target network hyperparameter
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic
    state_dim = 29  # Number of sensor inputs
    self.batch_size = 32
    self.lambda_mix = 10.0
    self.action_dim = 3  # Steering / Acceleration / Brake

    # TensorFlow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    self.actor = ActorNetwork(sess, state_dim, self.action_dim,
                              self.batch_size, TAU, LRA)
    self.critic = CriticNetwork(sess, state_dim, self.action_dim,
                                self.batch_size, TAU, LRC)
    self.buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer
    self.track_name = track_name

    self.save = dict(total_reward=[], total_step=[], ave_reward=[],
                     distRaced=[], distFromStart=[], lastLapTime=[],
                     curLapTime=[], lapTimes=[], avelapTime=[], ave_sp=[],
                     max_sp=[], min_sp=[],
                     test_total_reward=[], test_total_step=[],
                     test_ave_reward=[], test_distRaced=[],
                     test_distFromStart=[], test_lastLapTime=[],
                     test_curLapTime=[], test_lapTimes=[],
                     test_avelapTime=[], test_ave_sp=[],
                     test_max_sp=[], test_min_sp=[])
def __init__(self, env):
    self.sess = tf.InteractiveSession()
    # self.params = loadparams()  # ???
    self.env = env
    self.n_states = env.observation_space.shape[0]
    self.n_actions = env.action_space.shape[0]
    self.low = self.env.action_space.low
    self.high = self.env.action_space.high

    self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions)
    self.trainable_var_count = self.actor_network.get_trainable_var_count()
    self.critic_network = CriticNetwork(self.sess, self.n_states,
                                        self.n_actions, self.actor_network,
                                        self.trainable_var_count)

    self.replay_buffer = ReplayBuffer(BUFFER_SIZE)  # params['buffer_size']???
    self.exploration_noise = OUNoise(self.n_actions)
    # self.noise = Noise()
    self.gamma = GAMMA
    self.sess.run(tf.global_variables_initializer())
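# --- Hedged sketch (assumption, not from the original source): passing the
# actor into the critic, as above, typically lets the critic wire up the DDPG
# actor update through dQ/da. A minimal TF1 illustration of that wiring; the
# layer sizes and names here are placeholders, not the snippet's real ones.
import tensorflow as tf

state_ph = tf.placeholder(tf.float32, [None, 4], name='state')
action = tf.layers.dense(state_ph, 2, tf.nn.tanh, name='actor')  # actor head
q_value = tf.layers.dense(tf.concat([state_ph, action], axis=1), 1,
                          name='critic')                         # critic head

actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'actor')
# Maximizing Q means minimizing -Q; gradients flow through the action
# back into the actor's weights only.
actor_loss = -tf.reduce_mean(q_value)
train_actor = tf.train.AdamOptimizer(1e-4).minimize(actor_loss,
                                                    var_list=actor_vars)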
def __init__(self, env, device):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    self.device = device
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]

    self.actor_network = ActorNetwork(self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, state_dim, action_dim, env):
    self.name = 'DDPG'  # name for uploading results
    self.time_step = 0
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.environment = env
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, num_agents, state_dim, action_dim):
    # track training times
    self.time_step = 0

    # Use a session configured to log device placement (for GPU debugging)
    # self.sess = tf.InteractiveSession()
    self.sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

    self.num_agents = num_agents
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.agents = self.create_multi_agents(self.sess, num_agents,
                                           self.state_dim, self.action_dim)
    # Make sure to create the CriticNetwork afterwards; it summarises the
    # mean Q value internally.
    self.critic = CriticNetwork(self.sess, state_dim, action_dim)
    self.exploration_noise = OUNoise((self.num_agents, action_dim))
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # for storing checkpoints
    self.saver = tf.train.Saver()
def initialize(self):
    self.net = {}
    self.opt = {}
    if self.pars["twin"]:
        self.twins = ["_1", "_2"]
    else:
        self.twins = [""]

    for netname in ["act_loc", "act_tg"]:
        self.net[netname] = LinearNetwork(
            input_shape=self.pars["state_size"],
            lin_layers=self.pars["actor_layers"],
            output_shape=self.pars["act_size"],
            seed=self.pars["seed"])
    self.opt["act"] = torch.optim.Adam(self.net["act_loc"].parameters(),
                                       lr=self.pars["lr_act"])

    for twin in self.twins:
        for netname in ["crit_loc", "crit_tg"]:
            self.net[netname + twin] = CriticNetwork(
                input_shape=self.pars["state_size"],
                lin_layers=self.pars["crit_layers"],
                output_shape=(1,),
                action_layer=self.pars["act_input"],
                action_shape=self.pars["act_size"],
                seed=self.pars["seed"])
        self.opt["crit" + twin] = torch.optim.Adam(
            self.net["crit_loc" + twin].parameters(),
            lr=self.pars["lr_crit"])

    if self.pars["erep_eps"] < 1.0:
        self.mem = ExperienceReplayer(self.pars["erep_size"],
                                      wait_fill=self.pars["erep_fill"],
                                      default_prio=self.pars["erep_def_prio"],
                                      epsilon=self.pars["erep_eps"])
    else:
        self.mem = ExperienceReplayer(self.pars["erep_size"],
                                      wait_fill=self.pars["erep_fill"])
    self.train_cr_count = 0
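# --- Hedged sketch (assumption): when the "twin" flag above is set, the usual
# TD3-style target takes the elementwise minimum over the two target critics
# to curb overestimation. Illustrative PyTorch; tensor names are hypothetical.
import torch

def clipped_double_q_target(rewards, dones, q1_next, q2_next, gamma=0.99):
    """TD target using the min over twin target critics (TD3)."""
    q_next = torch.min(q1_next, q2_next)
    return rewards + gamma * (1.0 - dones) * q_next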
def __init__(self, state_dim, action_dim):
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, environment):
    self.name = 'DDPG'  # name for uploading results
    self.environment = environment
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.actor_network = ActorNetwork(
        state_size=environment.observation_space.shape[0],
        action_size=environment.action_space.shape[0])
    self.critic_network = CriticNetwork(
        state_size=environment.observation_space.shape[0],
        action_size=environment.action_space.shape[0])

    # initialize replay buffer
    self.replay_buffer = deque()

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(environment.action_space.shape[0])

    # Initialize time step
    self.time_step = 0
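# --- Hedged sketch (assumption): unlike the other snippets, this one uses a
# bare deque as the replay buffer. A minimal bounded wrapper with uniform
# sampling could look like this; the capacity and field layout are assumptions.
import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, capacity=1000000):
        # Old transitions drop off automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)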
def __init__(self, sess, env, par_idx):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.par_idx = par_idx
    self.sess = sess

    with tf.variable_scope("particle_" + str(par_idx)):
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim, self.par_idx)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self):
    self.name = 'DDPG'  # name for uploading results
    # self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = 12
    self.action_dim = 10
    self.has_kicked = False
    self.laststep_haskicked = False
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    self.saver = tf.train.Saver(max_to_keep=1)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)
def __init__(self, env_name, sess, state_dim, action_dim, models_dir, img_dim):
    self.name = 'DDPG'
    self.env_name = env_name
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.img_dim = img_dim
    self.models_dir = models_dir
    # Ensure action bound is symmetric
    self.time_step = 0
    self.sess = sess

    self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                      self.action_dim, self.img_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                        self.action_dim, self.img_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.saver = tf.train.Saver()
def __init__(self, env, loadfilename=None, printVars=False):
    self.name = 'DDPG'  # name for uploading results
    self.environment = env
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
    # print('init complete')

    self.all_vars = tf.global_variables()
    if printVars:
        for v in self.all_vars:
            print(v.name.ljust(30), v.shape)
    self.saver = tf.train.Saver(self.all_vars)
    if loadfilename is not None:
        self.saver.restore(self.sess, loadfilename)
def __init__(self, state_dim, action_dim):
    # name for uploading results
    self.name = 'DDPG'
    self.time_step = 0
    # self.atten_rate = 1
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    # Initialize a training thread
    self.threading = threading.Thread(target=self.train,
                                      name='LoopThread--DDPG')
def __init__(self, env):
    # ---- init the networks, replay buffer, exploration noise, and counter ----
    self.name = 'DDPG'  # name for uploading results
    # Randomly initialize actor network and critic network
    # with both their target networks
    self.state_dim = env[0]
    self.action_dim = env[1]
    self.sess = tf.InteractiveSession()

    self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
    self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

    # initialize replay buffer
    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

    # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
    self.exploration_noise = OUNoise(self.action_dim)

    self.epsilon_max = 1.0
    self.epsilon_min = 0.01
    self.counter = 0
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping),
    opts.use_cuda)

# Overwrite model parameters by parameters to load
model.load_state_dict({**model.state_dict(), **load_data.get('model', {})})

# Initialize baseline
if opts.baseline == 'exponential':
    baseline = ExponentialBaseline(opts.exp_beta)
elif opts.baseline == 'critic':
    baseline = CriticBaseline(
        maybe_cuda_model(
            CriticNetwork(problem.NODE_DIM, opts.embedding_dim,
                          opts.hidden_dim, opts.n_encode_layers,
                          opts.normalization),
            opts.use_cuda))
elif opts.baseline == 'rollout':
    baseline = RolloutBaseline(model, problem, opts)
else:
    assert opts.baseline is None, "Unknown baseline: {}".format(opts.baseline)
    baseline = NoBaseline()

# Load baseline from data, make sure script is called with same type of baseline
if 'baseline' in load_data:
    baseline.load_state_dict(load_data['baseline'])

# Initialize optimizer
optimizer = optim.Adam([{
    'params': model.parameters(),
def main():
    # Create the environment
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    # Create the replay memory
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()

    # Create placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32, shape=(),
                                             name='is_training_placeholder')

    # A counter to count the number of episodes
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    # Create the actor network inside the actor scope and calculate actions
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        unscaled_actions = actor.call(state_placeholder)

    # Scale the actions to fit within the bounds provided by the environment
    actions = scale_actions(unscaled_actions, env.action_space.low,
                            env.action_space.high)

    # Create the target actor network inside the target_actor scope and
    # calculate the target actions. Apply stop_gradient to the target actions
    # so that their gradient is not computed at any point of time.
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                                    trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)

    # Scale the actions to fit within the bounds provided by the environment
    target_actions_temp = scale_actions(unscaled_target_actions,
                                        env.action_space.low,
                                        env.action_space.high)
    target_actions = tf.stop_gradient(target_actions_temp)

    # Create the critic network inside the critic variable scope. Get the
    # Q-values of given actions and Q-values of actions suggested by the
    # actor network.
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)

    # Create the target critic network inside the target_critic variable
    # scope. Calculate the target Q-values and apply stop_gradient to it.
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                                      trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    # Collect
    # - trainable variables in actor (weights of the actor network),
    # - weights of the target actor network,
    # - trainable variables in critic (weights of the critic network),
    # - weights of the target critic network
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

    # Get the operators for updating the target networks. The
    # update_target_networks function defined in utils returns a list of
    # operators to be run from the tf session in order to update the target
    # networks using soft updates.
    update_targets_op = update_target_networks(TAU, target_actor_vars,
                                               actor_vars, target_critic_vars,
                                               critic_vars)

    # Create the tf operation to train the critic network:
    # - calculate the TD-target
    # - calculate the TD-error = TD-target - q_values_of_given_actions
    # - calculate the critic network's loss (mean squared TD-error, plus
    #   L2 regularization of the non-bias weights)
    # - create a tf operation to train the critic network
    targets = (tf.expand_dims(reward_placeholder, 1)
               + tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA
               * target_q_values)
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization of the critic weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

    # Create a tf operation to train the actor network:
    # - calculate the actor network's loss
    # - create the tf operation to train the actor network
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(
            actor_loss, var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY**episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add Ornstein-Uhlenbeck noise to the actions
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take the action in the env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]
            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run(
                    [critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder:
                            np.asarray([elem[0] for elem in batch]),
                        action_placeholder:
                            np.asarray([elem[1] for elem in batch]),
                        reward_placeholder:
                            np.asarray([elem[2] for elem in batch]),
                        next_state_placeholder:
                            np.asarray([elem[3] for elem in batch]),
                        is_not_terminal_placeholder:
                            np.asarray([elem[4] for elem in batch]),
                        is_training_placeholder: True
                    })
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
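# --- Hedged sketch (assumption; scale_actions is referenced above but not
# shown): the call sites suggest it maps a tanh-squashed action in [-1, 1]
# onto [low, high] per dimension, via plain broadcasting arithmetic.
def scale_actions(unscaled_actions, low, high):
    """Linearly rescale actions from [-1, 1] to [low, high]."""
    return low + (unscaled_actions + 1.0) * 0.5 * (high - low)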
        opts.hidden_dim,
        problem,
        n_encode_layers=opts.n_encode_layers,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping),
    opts.use_cuda)

# Overwrite model parameters by parameters to load
model_ = get_inner_model(model)
model_.load_state_dict({**model_.state_dict(), **load_data.get('model', {})})

# Initialize baseline
baseline = CriticBaseline(
    maybe_cuda_model(
        CriticNetwork(2, opts.embedding_dim, opts.hidden_dim,
                      opts.n_encode_layers, opts.normalization),
        opts.use_cuda))

# Load baseline from data, make sure script is called with same type of baseline
if 'baseline' in load_data:
    baseline.load_state_dict(load_data['baseline'])

# Start the actual training loop
# val_dataset = problem.make_dataset(size=opts.graph_size,
#                                    num_samples=opts.val_size,
#                                    filename=opts.val_dataset)
# torch.save(val_dataset, 'test_data/myval_20.pt')
# val_dataset = torch.load('test_data/myval_20.pt')

if opts.eval_only:
    total_cost, return_return, best = validate(model, val_dataset, opts)
    print('Improving: {} +- {}'.format(
def __init__(self, input_dims, n_actions, env, fc1_dims, fc2_dims,
             alpha, beta, gamma, tau, noise1, noise2, clamp, delay,
             max_size, batch_size, warmup):
    self.gamma = gamma
    self.tau = tau
    self.noise1 = noise1
    self.noise2 = noise2
    self.clamp = clamp
    self.delay = delay
    self.batch_size = batch_size
    self.warmup = warmup
    self.learn_cntr = 0
    self.env = env
    self.n_actions = n_actions

    self.actor = ActorNetwork(input_shape=input_dims, n_actions=n_actions,
                              fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                              alpha=alpha, name='Actor_TD3PG.cpt',
                              checkpoint_dir='tmp/models')
    self.critic_1 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                  fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                  beta=beta, name='Critic_1_TD3PG.cpt',
                                  checkpoint_dir='tmp/models')
    self.critic_2 = CriticNetwork(input_shape=input_dims, n_actions=n_actions,
                                  fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                  beta=beta, name='Critic_2_TD3PG.cpt',
                                  checkpoint_dir='tmp/models')
    self.target_actor = ActorNetwork(input_shape=input_dims,
                                     n_actions=n_actions, fc1_dims=fc1_dims,
                                     fc2_dims=fc2_dims, alpha=alpha,
                                     name='Target_Actor_TD3PG.cpt',
                                     checkpoint_dir='tmp/models')
    self.target_critic_1 = CriticNetwork(input_shape=input_dims,
                                         n_actions=n_actions,
                                         fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                         beta=beta,
                                         name='Target_Critic_1_TD3PG.cpt',
                                         checkpoint_dir='tmp/models')
    self.target_critic_2 = CriticNetwork(input_shape=input_dims,
                                         n_actions=n_actions,
                                         fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                         beta=beta,
                                         name='Target_Critic_2_TD3PG.cpt',
                                         checkpoint_dir='tmp/models')

    self.memory = ReplayBuffer(max_size=max_size, input_shape=input_dims,
                               n_actions=n_actions)
    self.update_target_networks()
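# --- Hedged sketch (assumption; update_target_networks is called above but
# not shown): a Polyak update over the three online/target pairs. tau=1.0
# hard-copies the online weights, which initializes the targets on the
# first call from the constructor.
def update_target_networks(self, tau=1.0):
    """target = tau * online + (1 - tau) * target, for all three pairs."""
    pairs = [(self.target_actor, self.actor),
             (self.target_critic_1, self.critic_1),
             (self.target_critic_2, self.critic_2)]
    for target, online in pairs:
        for t_param, param in zip(target.parameters(), online.parameters()):
            t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)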