def init_value_network(self, shared_network=None, activation='linear', loss='mse'):
    if self.rl_method == 'td3':
        self.critic = CriticNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            num_steps=self.num_steps, activation=activation, loss=loss, lr=self.lr)
    elif self.net == 'dnn':
        self.value_network = DNN(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(
            input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps, shared_network=shared_network,
            activation=activation, loss=loss)
    if self.reuse_models and os.path.exists(self.value_network_path):
        self.value_network.load_model(model_path=self.value_network_path)
def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
             batch_size=100, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id+'_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id+'_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name=env_id+'_target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.02, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha
    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', max_action=env.action_space.high,
                              chkpt_dir=chkpt_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_1', chkpt_dir=chkpt_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_2', chkpt_dir=chkpt_dir)
    self.value = ValueNetwork(beta, input_dims, fc1_dims=layer1_size,
                              fc2_dims=layer2_size, name='value',
                              chkpt_dir=chkpt_dir)
    self.target_value = ValueNetwork(beta, input_dims, fc1_dims=layer1_size,
                                     fc2_dims=layer2_size, name='target_value',
                                     chkpt_dir=chkpt_dir)
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=5e-3,
             fc1_dim=256, fc2_dim=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, n_actions, env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx,
             chkpt_dir, alpha=0.01, beta=0.01, fc1=64, fc2=64,
             gamma=0.95, tau=0.01):
    self.gamma = gamma
    self.tau = tau
    self.n_actions = n_actions
    self.agent_name = 'agent_%s' % agent_idx
    self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                              chkpt_dir=chkpt_dir,
                              name=self.agent_name + '_actor')
    self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                chkpt_dir=chkpt_dir,
                                name=self.agent_name + '_critic')
    self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                     chkpt_dir=chkpt_dir,
                                     name=self.agent_name + '_target_actor')
    self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents,
                                       n_actions, chkpt_dir=chkpt_dir,
                                       name=self.agent_name + '_target_critic')
    self.update_network_parameters(tau=1)
def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions
    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
    self.target_value_nn = ValueNetwork(input_dims,
                                        name=Constants.env_id + '_target_value')
    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
             max_size=1000000, fc1_dims=400, fc2_dims=300, batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.alpha = alpha
    self.beta = beta
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.noise = OUActionNoise(mu=np.zeros(n_actions))
    self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                              n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='critic')
    self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                     n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                       n_actions=n_actions, name='target_critic')
    self.update_network_parameters(tau=1)
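# Several of the DDPG-style agents in these snippets construct an OUActionNoise object for
# exploration, but the class itself is not included. A minimal sketch of such a class is
# given below, assuming the common Ornstein-Uhlenbeck formulation; the theta, sigma, dt
# defaults and the callable interface are assumptions, not taken from the original sources.
import numpy as np

class OUActionNoise:
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu          # long-run mean of the process
        self.sigma = sigma    # noise scale
        self.theta = theta    # mean-reversion rate
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)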
def __init__(self, docker_client, name='worker', port=3101,
             model_path='../models/ddpg', log_path='../logs/ddpg'):
    self.state_size = 29
    self.action_size = 3
    self.docker_client = docker_client
    self.buffer_size = 100000
    self.batch_size = 32
    self.gamma = 0.99    # discount factor
    self.tau = 0.001     # target network hyperparameter
    self.lra = 0.0001    # learning rate for the actor
    self.lrc = 0.001     # learning rate for the critic
    seed(6486)
    self.explore = 100000.
    self.episode_count = 2000
    self.max_steps = 10000
    self.epsilon = 1
    self.model_path = model_path
    self.port = port
    self.name = name
    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    tf.reset_default_graph()
    self.summary_writer = tf.summary.FileWriter(log_path)
    self.actor = ActorNetwork(self.state_size, self.action_size,
                              tf.train.AdamOptimizer(self.lra), self.tau)
    self.critic = CriticNetwork(self.state_size, self.action_size,
                                tf.train.AdamOptimizer(self.lrc), self.tau)
    self.buff = ReplayBuffer(self.buffer_size)
    self.saver = tf.train.Saver()
    self._create_summary()
    self.summary_histogram = tf.summary.merge_all()
def main(_):
    '''
    previous = tf.train.import_meta_graph(SAVE_DIR + '/model.ckpt.meta')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        previous.restore(sess, tf.train.latest_checkpoint(SAVE_DIR + '/'))
        last_vars = tf.trainable_variables()
        data = sess.run(last_vars)
        print('Model Restored')
    '''
    tf.reset_default_graph()
    with tf.Session() as sess:
        env = Preon_env(opt.env_params)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = 9
        action_dim = 3
        goal_dim = 2

        actor = ActorNetwork(sess, state_dim, action_dim, goal_dim,
                             ACTOR_LEARNING_RATE, TAU, opt.env_params)
        critic = CriticNetwork(sess, state_dim, action_dim, goal_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), opt.env_params)

        if opt.train:
            train(sess, env, actor, critic, action_dim, goal_dim, state_dim)
        else:
            test(sess, env, actor, critic, action_dim, goal_dim, state_dim, opt.test_goal)
def __init__(self, alpha, beta, input_dims, action_bound, tau, env, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.action_bound = action_bound
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='Actor')
    self.critic = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                n_actions=n_actions, name='Critic')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions=n_actions, name='TargetActor')
    self.target_critic = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                       n_actions=n_actions, name='TargetCritic')
    self.noise = OUActionNoise(mu=np.zeros(n_actions))
    self.update_network_parameters(tau=1)
def __init__(self, n_actions, n_states, obs_shape, gamma=0.99, lr=0.0003,
             gae_lambda=0.95, entropy_coeff=0.0005, ppo_clip=0.2,
             mini_batch_size=64, n_epochs=10, clip_value_loss=True,
             normalize_observation=False,
             stop_normalize_obs_after_timesteps=50000,
             fc1=64, fc2=64, environment='None', run=0):
    self.entropy_coeff = entropy_coeff
    self.clip_value_loss = clip_value_loss
    self.gamma = gamma
    self.ppo_clip = ppo_clip
    self.n_epochs = n_epochs
    self.gae_lambda = gae_lambda
    self.normalize_observation = normalize_observation
    self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
    self.timestep = 0
    self.actor = ActorNetwork(n_states=n_states, n_actions=n_actions, lr=lr,
                              fc1_dims=fc1, fc2_dims=fc2,
                              chkpt_dir=environment, run=run)
    self.critic = CriticNetwork(n_states=n_states, lr=lr, fc1_dims=fc1,
                                fc2_dims=fc2, chkpt_dir=environment, run=run)
    self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
    self.running_stats = RunningStats(shape_states=obs_shape,
                                      chkpt_dir=environment, run=run)
def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0,
             n_actions=2, max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=100, reward_scale=2, path_dir='model/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='_actor',
                              max_action=max_action, chkpt_dir=path_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_1',
                                  chkpt_dir=path_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_2',
                                  chkpt_dir=path_dir)
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name='_value', chkpt_dir=path_dir)
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name='_target_value', chkpt_dir=path_dir)
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, state_size: int, action_size: int, gamma: float = 0.99,
             lr_actor: float = 0.001, lr_critic: float = 0.003,
             weight_decay: float = 0.0001, tau: float = 0.001,
             buffer_size: int = 100000, batch_size: int = 64):
    """
    :param state_size: how many states the agent gets as input (input size of the networks)
    :param action_size: how many actions the agent can choose from
    :param gamma: discount factor
    :param lr_actor: learning rate of the actor network
    :param lr_critic: learning rate of the critic network
    :param weight_decay: L2 weight decay applied to the critic optimizer
    :param tau: soft update parameter
    :param buffer_size: size of the replay buffer
    :param batch_size: size of a learning batch (mini-batch)
    """
    self.tau = tau
    self.gamma = gamma
    self.batch_size = batch_size

    self.actor_local = ActorNetwork(state_size, action_size).to(device)
    self.actor_target = ActorNetwork(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
    print(self.actor_local)

    self.critic_local = CriticNetwork(state_size, action_size).to(device)
    self.critic_target = CriticNetwork(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic, weight_decay=weight_decay)
    print(self.critic_local)

    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
    # this would probably also work with Gaussian noise instead of an Ornstein-Uhlenbeck process
    self.noise = OUNoise(action_size)
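# This snippet and the MADDPG snippet further down both rely on hard_update / soft_update
# helpers that are not included in these excerpts. A minimal PyTorch sketch of the usual
# pair is shown below; the (target, source) argument order follows the MADDPG snippet and
# is otherwise an assumption, as is the use of free functions rather than methods.
import torch

def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # copy every parameter of `source` into `target` (target <- source)
    target.load_state_dict(source.state_dict())

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging of parameters: target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)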
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')
    self.scale = reward_scale
    # sets the parameters of the target network equal to those of the value network
    self.update_network_parameters(tau=1)
def __init__(self, logger, replay_buffer):
    super(Learner, self).__init__(name="Learner")
    self.device = Params.DEVICE
    with tf.device(self.device), self.name_scope:
        self.dtype = Params.DTYPE
        self.logger = logger
        self.batch_size = Params.MINIBATCH_SIZE
        self.gamma = Params.GAMMA
        self.tau = Params.TAU
        self.replay_buffer = replay_buffer
        self.priority_beta = tf.Variable(Params.BUFFER_PRIORITY_BETA_START)
        self.running = tf.Variable(True)
        self.n_steps = tf.Variable(0)

        # Init networks
        self.actor = ActorNetwork(with_target_net=True)
        self.critic = CriticNetwork()

        # Save shared variables
        self.policy_variables = self.actor.tvariables + self.actor.nvariables
def __init__(self, alpha=0.00005, beta=0.00005, input_dims=5, env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    latent_dims = 10
    self.actor = ActorNetwork_2(alpha, latent_dims, env.action_space.high,
                                n_actions=n_actions)
    self.critic_1 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_1')
    self.critic_2 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_2')
    self.value = ValueNetwork(beta, latent_dims, name='value_det')
    self.target_value = ValueNetwork(beta, latent_dims, name='target_value_det')
    self.VAE = LinearVAE()
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99, tau=1e-3,
             lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay_actor=1e-5,
             weight_decay_critic=1e-4, clip_grad=1.0):
    super(MADDPGAgent, self).__init__()

    # Write parameters
    self.num_agents = num_agents
    self.gamma = gamma
    self.tau = tau
    self.clip_grad = clip_grad

    # Create all the networks
    self.actor = ActorNetwork(obs_size, act_size).to(device)
    self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
    self.target_actor = ActorNetwork(obs_size, act_size).to(device)
    self.target_critic = CriticNetwork(num_agents, obs_size, act_size).to(device)

    # Copy initial network parameters to target networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Initialize training optimizers and OU noise
    self.noise = OUNoise(act_size, scale=1.0)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                weight_decay=weight_decay_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                 weight_decay=weight_decay_critic)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], max_action=1,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=512, layer2_size=512, batch_size=512, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=max_action)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def main(args):
    with tf.Session() as session:
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        # initialize ROS interface
        agent = fake.fake_agent()
        plant = fake.fake_plant()
        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # initialize function approximators
        actor_network = ActorNetwork(session, state_shape, action_shape, action_bound,
                                     float(args['actor_lr']), float(args['tau']),
                                     loss_mask=True)
        critic_network = CriticNetwork(session, state_shape, action_shape,
                                       float(args['critic_lr']), float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session, actor_network, critic_network, predictor_network, agent, plant,
              latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
             update_actor_interval=2, warmup=1000, n_actions=2, max_size=1000000,
             layer1_size=400, layer2_size=300, batch_size=100, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.learn_step_cntr = 0
    self.time_step = 0
    self.warmup = warmup
    self.n_actions = n_actions
    self.update_actor_iter = update_actor_interval
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='actor')
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='critic_2')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions=n_actions, name='target_actor')
    self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name='target_critic_1')
    self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name='target_critic_2')
    self.noise = noise
    self.update_network_parameters(tau=1)
PRIORITIZED = True
BATCH_SIZE = 64
RANDOM_SEED = 1234

# set up environment
env = gym.make(ENV_NAME)
bound = np.max(env.action_space.high)
state = state_prime = env.reset()
action = env.action_space.sample()
a_dim = len(action)
s_dim = len(state)

# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ
actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound)
explore = ExploreNetwork(sess, state, action, EXP_LEARNING_RATE, TAU, bound)
critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU)

# initialize variables and store tensorboard graph
sess.run(tf.global_variables_initializer())
summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph)
summary_writer.close()

# initialize target networks Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ
actor.update_target_network()
explore.update_target_network()
critic.update_target_network()

# initialize replay buffer
replay = ReplayBuffer(BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED)
def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target networks
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)
                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r, terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                        step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's target values.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not from the episode,
                    # so the episode_avg_max statistic is not really appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this perhaps be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)
                    break
class Agent:
    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003,
                 gae_lambda=0.95, policy_clip=0.2, batch_size=64, n_epochs=10,
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.chkpt_dir = chkpt_dir
        self.actor = ActorNetwork(n_actions)
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=alpha))
        self.memory = PPOMemory(batch_size)

    def store_transition(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save(self.chkpt_dir + 'actor')
        self.critic.save(self.chkpt_dir + 'critic')

    def load_models(self):
        print('... loading models ...')
        self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
        self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        probs = self.actor(state)
        dist = tfp.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        action = action.numpy()[0]
        value = value.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, value

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr, \
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            for t in range(len(reward_arr) - 1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr) - 1):
                    a_t += discount * (reward_arr[k] +
                                       self.gamma * values[k + 1] * (1 - int(dones_arr[k])) -
                                       values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[batch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                    actions = tf.convert_to_tensor(action_arr[batch])

                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)
                    critic_value = tf.squeeze(critic_value, 1)

                    prob_ratio = tf.math.exp(new_probs - old_probs)
                    weighted_probs = advantage[batch] * prob_ratio
                    clipped_probs = tf.clip_by_value(prob_ratio,
                                                     1 - self.policy_clip,
                                                     1 + self.policy_clip)
                    weighted_clipped_probs = clipped_probs * advantage[batch]
                    actor_loss = -tf.math.minimum(weighted_probs,
                                                  weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    returns = advantage[batch] + values[batch]
                    # critic_loss = tf.math.reduce_mean(tf.math.pow(
                    #     returns - critic_value, 2))
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                self.actor.optimizer.apply_gradients(
                    zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                    zip(critic_grads, critic_params))

        self.memory.clear_memory()
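# The advantage computation in learn() above rebuilds the GAE sum from scratch for every
# timestep, which is O(T^2) in the trajectory length. The standard backward recursion below
# produces the estimate in a single pass; unlike the nested loop above, it also cuts the
# lambda-accumulation at episode boundaries. This is a standalone sketch for comparison,
# not part of the original class.
import numpy as np

def gae_backward(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Backward-recursion GAE: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}."""
    T = len(rewards)
    advantage = np.zeros(T, dtype=np.float32)
    a_next = 0.0
    for t in reversed(range(T - 1)):
        delta = rewards[t] + gamma * values[t + 1] * (1 - int(dones[t])) - values[t]
        a_next = delta + gamma * gae_lambda * (1 - int(dones[t])) * a_next
        advantage[t] = a_next
    return advantage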
class Agent:
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_value.set_weights(weights)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(value_loss,
                                               self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        with tf.GradientTape() as tape:
            # in the original paper, they reparameterize here; we don't implement
            # that, so it's just the usual action
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        with tf.GradientTape(persistent=True) as tape:
            # I didn't know that these context managers shared values?
            q_hat = self.scale * reward + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(state, action), 1)
            q2_old_policy = tf.squeeze(self.critic_2(state, action), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)
        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()
class Agent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 max_size=50000, fc1_dims=400, fc2_dims=300, batch_size=32):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                  n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name='critic')
        self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                         n_actions=n_actions, name='target_actor')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                           n_actions=n_actions, name='target_critic')

        # on the first call, the target networks are made identical to the online networks
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        # set the actor to eval mode: the batch normalization layers should not
        # update their running statistics during action selection
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = self.memory.sample_buffer(
            self.batch_size)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)

        target_actions = self.target_actor.forward(states_)
        critic_value_ = self.target_critic.forward(states_, target_actions)
        critic_value = self.critic.forward(states, actions)

        critic_value_[done] = 0.0
        critic_value_ = critic_value_.view(-1)

        target = rewards + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        # no tau is passed here, so update_network_parameters falls back to self.tau
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
import gym
import gym_lqr
import numpy as np
import torch as T

from networks import ActorNetwork, CriticNetwork

if __name__ == '__main__':
    #env = gym.make('InvertedPendulumPyBulletEnv-v0')
    #env = gym.make('gym_lqr:lqr-stochastic-v0')
    env = gym.make('gym_lqr:lqr-v0')
    #env = gym.make('InvertedPendulum-v2')
    #print(env.action_space.shape[0])

    actor = ActorNetwork(0.0003, input_dims=env.observation_space.shape,
                         n_actions=env.action_space.shape[0],
                         max_action=env.action_space.high)
    critic_1 = CriticNetwork(0.0003, input_dims=env.observation_space.shape,
                             n_actions=env.action_space.shape[0], name='critic_1')
    critic_2 = CriticNetwork(0.0003, input_dims=env.observation_space.shape,
                             n_actions=env.action_space.shape[0], name='critic_2')

    actor.load_checkpoint()
    critic_1.load_checkpoint()
    critic_2.load_checkpoint()

    # Load optimal P
    env.set_P(np.load('tmp/sac/optimal_P.npy'))

    # Create states and actions
    states = np.expand_dims(
        np.expand_dims(np.arange(-100, 100, 5, dtype=np.float32), -1), -1)
    actions = np.expand_dims(
        np.expand_dims(np.arange(-10, 10, 0.5, dtype=np.float32), -1), -1)
class Agent:
    def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 fc1=400, fc2=300, batch_size=64, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=self.noise)
        # note that if the env has an action > 1, we have to multiply by
        # max action at some point
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(
                self.target_critic(states_, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(critic_loss,
                                                self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_network_gradient, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()
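# A minimal training-loop sketch for the DDPG-style Agent above. The environment name,
# episode count, and the older Gym step API (4-tuple return) are assumptions for
# illustration, not taken from the original source.
import gym

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])

    for episode in range(10):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()  # no-op until the replay buffer holds a full batch
            score += reward
            observation = observation_
        print('episode', episode, 'score %.1f' % score)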
            loc=mu, scale=sigma))
    return np.array(noise)

# set up environment
env = gym.make(ENV_NAME)
bound = np.max(env.action_space.high)
state = state_prime = env.reset()
action = env.action_space.sample()
a_dim = len(action)
s_dim = len(state)

# initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ
actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound)
critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU)

# initialize variables and store tensorboard graph
sess.run(tf.initialize_all_variables())
summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph)
summary_writer.close()

# initialize target networks Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ
actor.update_target_network()
critic.update_target_network()

# initialize replay buffer
replay = ReplayBuffer(BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED)