def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
             batch_size=100, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id+'_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id+'_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id+'_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name=env_id+'_target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(name='value')
    self.target_value = ValueNetwork(name='target_value')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic_1.compile(optimizer=Adam(learning_rate=beta))
    self.critic_2.compile(optimizer=Adam(learning_rate=beta))
    self.value.compile(optimizer=Adam(learning_rate=beta))
    self.target_value.compile(optimizer=Adam(learning_rate=beta))

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
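Every constructor in these snippets ends with update_network_parameters(tau=1), but that method is not shown. As a point of reference, here is a minimal sketch of the Polyak soft update such a call typically performs for the Keras-based agent above; the helper name soft_update and the standalone-function form are assumptions, not code from the source repositories.

def soft_update(online_net, target_net, tau):
    # Polyak average: target <- tau * online + (1 - tau) * target.
    # With tau=1 this is a hard copy, matching the tau=1 call at construction time.
    new_weights = []
    for online_w, target_w in zip(online_net.get_weights(),
                                  target_net.get_weights()):
        new_weights.append(tau * online_w + (1.0 - tau) * target_w)
    target_net.set_weights(new_weights)

# e.g. inside update_network_parameters: soft_update(self.value, self.target_value, tau)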
def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None, gamma=0.99,
             n_actions=2, max_size=1000000, tau=0.005, fc1=400, fc2=300,
             batch_size=64, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.noise = noise
    self.max_action = env.action_space.high[0]
    self.min_action = env.action_space.low[0]

    self.actor = ActorNetwork(n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(name='critic')
    self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(name='target_critic')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic.compile(optimizer=Adam(learning_rate=beta))
    self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
    self.target_critic.compile(optimizer=Adam(learning_rate=beta))

    self.update_network_parameters(tau=1)
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None, gamma=0.99,
             n_actions=2, max_size=1000000, tau=5e-3, fc1_dim=256, fc2_dim=256,
             batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions, env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions

    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
    self.target_value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_target_value')

    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.0001, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha
    self.reward_scale = reward_scale

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', chkpt_dir=chkpt_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_1', chkpt_dir=chkpt_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_2', chkpt_dir=chkpt_dir)
    self.target_critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size, fc2_dims=layer2_size,
                                         name='target_critic_1', chkpt_dir=chkpt_dir)
    self.target_critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size, fc2_dims=layer2_size,
                                         name='target_critic_2', chkpt_dir=chkpt_dir)

    self.update_network_parameters(tau=1)
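This variant keeps twin target critics instead of a value network, so the critic's Bellman target is formed directly from them. Below is a minimal PyTorch sketch of such a target using the ent_alpha and reward_scale hyperparameters from the constructor above; the function name and tensor arguments are illustrative assumptions, not code from the repository.

import torch

def soft_q_target(reward, done, next_q1, next_q2, next_log_prob,
                  gamma=0.99, ent_alpha=0.0001, reward_scale=2):
    # Soft Bellman backup: minimum of the twin target critics minus the
    # entropy term; terminal transitions get no bootstrapped value.
    next_value = torch.min(next_q1, next_q2) - ent_alpha * next_log_prob
    return reward_scale * reward + gamma * (1.0 - done.float()) * next_value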
def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx,
             chkpt_dir, alpha=0.01, beta=0.01, fc1=64, fc2=64,
             gamma=0.95, tau=0.01):
    self.gamma = gamma
    self.tau = tau
    self.n_actions = n_actions
    self.agent_name = 'agent_%s' % agent_idx

    self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                              chkpt_dir=chkpt_dir,
                              name=self.agent_name + '_actor')
    self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                chkpt_dir=chkpt_dir,
                                name=self.agent_name + '_critic')
    self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                     chkpt_dir=chkpt_dir,
                                     name=self.agent_name + '_target_actor')
    self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents,
                                       n_actions, chkpt_dir=chkpt_dir,
                                       name=self.agent_name + '_target_critic')

    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
             max_size=1000000, fc1_dims=400, fc2_dims=300, batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.alpha = alpha
    self.beta = beta

    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.noise = OUActionNoise(mu=np.zeros(n_actions))

    self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                              n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='critic')
    self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                     n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                       n_actions=n_actions, name='target_critic')

    self.update_network_parameters(tau=1)
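Several of the DDPG agents here construct OUActionNoise(mu=np.zeros(n_actions)) for exploration, but its implementation is not part of these snippets. The following is a standard Ornstein-Uhlenbeck process sketch; the default sigma, theta, and dt values are commonly used assumptions, not values taken from the source repositories.

import numpy as np

class OUActionNoise:
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)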
def init_value_network(self, shared_network=None, activation='linear', loss='mse'):
    if self.rl_method == 'td3':
        self.critic = CriticNetwork(input_dim=self.num_features,
                                    output_dim=self.agent.NUM_ACTIONS,
                                    num_steps=self.num_steps,
                                    activation=activation, loss=loss, lr=self.lr)
    elif self.net == 'dnn':
        self.value_network = DNN(input_dim=self.num_features,
                                 output_dim=self.agent.NUM_ACTIONS,
                                 lr=self.lr, shared_network=shared_network,
                                 activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(input_dim=self.num_features,
                                         output_dim=self.agent.NUM_ACTIONS,
                                         lr=self.lr, num_steps=self.num_steps,
                                         shared_network=shared_network,
                                         activation=activation, loss=loss)
    if self.reuse_models and os.path.exists(self.value_network_path):
        self.value_network.load_model(model_path=self.value_network_path)
def main(_):
    '''
    previous = tf.train.import_meta_graph(SAVE_DIR + '/model.ckpt.meta')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        previous.restore(sess, tf.train.latest_checkpoint(SAVE_DIR + '/'))
        last_vars = tf.trainable_variables()
        data = sess.run(last_vars)
        print('Model Restored')
    '''
    tf.reset_default_graph()
    with tf.Session() as sess:
        env = Preon_env(opt.env_params)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = 9
        action_dim = 3
        goal_dim = 2

        actor = ActorNetwork(sess, state_dim, action_dim, goal_dim,
                             ACTOR_LEARNING_RATE, TAU, opt.env_params)
        critic = CriticNetwork(sess, state_dim, action_dim, goal_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), opt.env_params)

        if opt.train:
            train(sess, env, actor, critic, action_dim, goal_dim, state_dim)
        else:
            test(sess, env, actor, critic, action_dim, goal_dim, state_dim, opt.test_goal)
def __init__(self, alpha, beta, input_dims, action_bound, tau, env, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=64):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.action_bound = action_bound

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='Actor')
    self.critic = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                n_actions=n_actions, name='Critic')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions=n_actions, name='TargetActor')
    self.target_critic = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                       n_actions=n_actions, name='TargetCritic')

    self.noise = OUActionNoise(mu=np.zeros(n_actions))

    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0,
             n_actions=2, max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=100, reward_scale=2, path_dir='model/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='_actor',
                              max_action=max_action, chkpt_dir=path_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_1',
                                  chkpt_dir=path_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_2',
                                  chkpt_dir=path_dir)
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name='_value', chkpt_dir=path_dir)
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name='_target_value', chkpt_dir=path_dir)

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, state_size: int, action_size: int, gamma: float = 0.99,
             lr_actor: float = 0.001, lr_critic: float = 0.003,
             weight_decay: float = 0.0001, tau: float = 0.001,
             buffer_size: int = 100000, batch_size: int = 64):
    """
    :param state_size: how many states the agent gets as input (input size of the neural networks)
    :param action_size: from how many actions the agent can choose
    :param gamma: discount factor
    :param lr_actor: learning rate of the actor network
    :param lr_critic: learning rate of the critic network
    :param weight_decay: L2 weight decay applied to the critic optimizer
    :param tau: soft update parameter
    :param buffer_size: size of replay buffer
    :param batch_size: size of learning batch (mini-batch)
    """
    self.tau = tau
    self.gamma = gamma
    self.batch_size = batch_size

    self.actor_local = ActorNetwork(state_size, action_size).to(device)
    self.actor_target = ActorNetwork(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
    print(self.actor_local)

    self.critic_local = CriticNetwork(state_size, action_size).to(device)
    self.critic_target = CriticNetwork(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic, weight_decay=weight_decay)
    print(self.critic_local)

    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.memory = ReplayBuffer(action_size, buffer_size, batch_size)

    # this would probably also work with Gaussian noise instead of an Ornstein-Uhlenbeck process
    self.noise = OUNoise(action_size)
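This agent calls self.hard_update(...) to copy the local networks into the targets, and a matching soft update is normally applied during learning. Here is a minimal PyTorch sketch of both helpers; the (local, target) argument order is inferred from the call sites above, and the free-function form is an assumption.

def hard_update(local_model, target_model):
    # Copy local parameters into the target network (typically done once at init).
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(local_param.data)

def soft_update(local_model, target_model, tau):
    # Polyak average: target <- tau * local + (1 - tau) * target.
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)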
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    # sets the target network's parameters equal to the online value network's
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.00005, beta=0.00005, input_dims=5, env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    latent_dims = 10
    self.actor = ActorNetwork_2(alpha, latent_dims, env.action_space.high,
                                n_actions=n_actions)
    self.critic_1 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_1')
    self.critic_2 = CriticNetwork(beta, latent_dims, n_actions, name='critic_det_2')
    self.value = ValueNetwork(beta, latent_dims, name='value_det')
    self.target_value = ValueNetwork(beta, latent_dims, name='target_value_det')
    self.VAE = LinearVAE()

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
             update_actor_interval=2, warmup=1000, n_actions=2, max_size=1000000,
             layer1_size=400, layer2_size=300, batch_size=100, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.learn_step_cntr = 0
    self.time_step = 0
    self.warmup = warmup
    self.n_actions = n_actions
    self.update_actor_iter = update_actor_interval

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='actor')
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='critic_2')
    self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                     n_actions=n_actions, name='target_actor')
    self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name='target_critic_1')
    self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                         n_actions=n_actions, name='target_critic_2')

    self.noise = noise
    self.update_network_parameters(tau=1)
def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99, tau=1e-3,
             lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay_actor=1e-5,
             weight_decay_critic=1e-4, clip_grad=1.0):
    super(MADDPGAgent, self).__init__()

    # Write parameters
    self.num_agents = num_agents
    self.gamma = gamma
    self.tau = tau
    self.clip_grad = clip_grad

    # Create all the networks
    self.actor = ActorNetwork(obs_size, act_size).to(device)
    self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
    self.target_actor = ActorNetwork(obs_size, act_size).to(device)
    self.target_critic = CriticNetwork(num_agents, obs_size, act_size).to(device)

    # Copy initial network parameters to target networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Initialize training optimizers and OU noise
    self.noise = OUNoise(act_size, scale=1.0)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                weight_decay=weight_decay_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                 weight_decay=weight_decay_critic)
def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions

    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], max_action=1,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=512, layer2_size=512, batch_size=512, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=max_action)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def main(args):
    with tf.Session() as session:
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        # initialize ROS interface
        agent = fake.fake_agent()
        plant = fake.fake_plant()
        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # initialize function approximators
        actor_network = ActorNetwork(session, state_shape, action_shape,
                                     action_bound, float(args['actor_lr']),
                                     float(args['tau']), loss_mask=True)
        critic_network = CriticNetwork(session, state_shape, action_shape,
                                       float(args['critic_lr']), float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session, actor_network, critic_network, predictor_network,
              agent, plant, latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
def __init__(self, docker_client, name='worker', port=3101,
             model_path='../models/ddpg', log_path='../logs/ddpg'):
    self.state_size = 29
    self.action_size = 3
    self.docker_client = docker_client
    self.buffer_size = 100000
    self.batch_size = 32
    self.gamma = 0.99    # discount factor
    self.tau = 0.001     # target network hyperparameter
    self.lra = 0.0001    # learning rate for the actor
    self.lrc = 0.001     # learning rate for the critic
    seed(6486)
    self.explore = 100000.
    self.episode_count = 2000
    self.max_steps = 10000
    self.epsilon = 1
    self.model_path = model_path
    self.port = port
    self.name = name

    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)

    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    tf.reset_default_graph()

    self.summary_writer = tf.summary.FileWriter(log_path)
    self.actor = ActorNetwork(self.state_size, self.action_size,
                              tf.train.AdamOptimizer(self.lra), self.tau)
    self.critic = CriticNetwork(self.state_size, self.action_size,
                                tf.train.AdamOptimizer(self.lrc), self.tau)
    self.buff = ReplayBuffer(self.buffer_size)
    self.saver = tf.train.Saver()

    self._create_summary()
    self.summary_histogram = tf.summary.merge_all()
def __init__(self, n_actions, n_states, obs_shape, gamma=0.99, lr=0.0003,
             gae_lambda=0.95, entropy_coeff=0.0005, ppo_clip=0.2,
             mini_batch_size=64, n_epochs=10, clip_value_loss=True,
             normalize_observation=False, stop_normalize_obs_after_timesteps=50000,
             fc1=64, fc2=64, environment='None', run=0):
    self.entropy_coeff = entropy_coeff
    self.clip_value_loss = clip_value_loss
    self.gamma = gamma
    self.ppo_clip = ppo_clip
    self.n_epochs = n_epochs
    self.gae_lambda = gae_lambda
    self.normalize_observation = normalize_observation
    self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
    self.timestep = 0

    self.actor = ActorNetwork(n_states=n_states, n_actions=n_actions, lr=lr,
                              fc1_dims=fc1, fc2_dims=fc2,
                              chkpt_dir=environment, run=run)
    self.critic = CriticNetwork(n_states=n_states, lr=lr, fc1_dims=fc1,
                                fc2_dims=fc2, chkpt_dir=environment, run=run)
    self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
    self.running_stats = RunningStats(shape_states=obs_shape,
                                      chkpt_dir=environment, run=run)
def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003,
             gae_lambda=0.95, policy_clip=0.2, batch_size=64, n_epochs=10,
             chkpt_dir='models/'):
    self.gamma = gamma
    self.policy_clip = policy_clip
    self.n_epochs = n_epochs
    self.gae_lambda = gae_lambda
    self.chkpt_dir = chkpt_dir

    self.actor = ActorNetwork(n_actions)
    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic = CriticNetwork()
    self.critic.compile(optimizer=Adam(learning_rate=alpha))
    self.memory = PPOMemory(batch_size)
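For reference, the policy_clip parameter of the two PPO agents above feeds the usual clipped surrogate objective. Below is a minimal TensorFlow sketch of that loss; the function and argument names are illustrative assumptions, not code from either repository.

import tensorflow as tf

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, policy_clip=0.2):
    # Probability ratio between the updated policy and the policy that collected the data.
    ratio = tf.exp(new_log_probs - old_log_probs)
    unclipped = ratio * advantages
    clipped = tf.clip_by_value(ratio, 1.0 - policy_clip, 1.0 + policy_clip) * advantages
    # Maximize the clipped surrogate, i.e. minimize its negation.
    return -tf.reduce_mean(tf.minimum(unclipped, clipped))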
def __init__(self, logger, replay_buffer):
    super(Learner, self).__init__(name="Learner")

    self.device = Params.DEVICE

    with tf.device(self.device), self.name_scope:
        self.dtype = Params.DTYPE
        self.logger = logger
        self.batch_size = Params.MINIBATCH_SIZE
        self.gamma = Params.GAMMA
        self.tau = Params.TAU
        self.replay_buffer = replay_buffer
        self.priority_beta = tf.Variable(Params.BUFFER_PRIORITY_BETA_START)
        self.running = tf.Variable(True)
        self.n_steps = tf.Variable(0)

        # Init Networks
        self.actor = ActorNetwork(with_target_net=True)
        self.critic = CriticNetwork()

        # Save shared variables
        self.policy_variables = self.actor.tvariables + self.actor.nvariables
import gym
import numpy as np
import torch as T
from networks import ActorNetwork, CriticNetwork
import gym_lqr

if __name__ == '__main__':
    # env = gym.make('InvertedPendulumPyBulletEnv-v0')
    # env = gym.make('gym_lqr:lqr-stochastic-v0')
    env = gym.make('gym_lqr:lqr-v0')
    # env = gym.make('InvertedPendulum-v2')
    # print(env.action_space.shape[0])

    actor = ActorNetwork(0.0003, input_dims=env.observation_space.shape,
                         n_actions=env.action_space.shape[0],
                         max_action=env.action_space.high)
    critic_1 = CriticNetwork(0.0003, input_dims=env.observation_space.shape,
                             n_actions=env.action_space.shape[0], name='critic_1')
    critic_2 = CriticNetwork(0.0003, input_dims=env.observation_space.shape,
                             n_actions=env.action_space.shape[0], name='critic_2')

    actor.load_checkpoint()
    critic_1.load_checkpoint()
    critic_2.load_checkpoint()

    # Load optimal P
    env.set_P(np.load('tmp/sac/optimal_P.npy'))

    # Create states and actions
    states = np.expand_dims(
        np.expand_dims(np.arange(-100, 100, 5, dtype=np.float32), -1), -1)
    actions = np.expand_dims(
        np.expand_dims(np.arange(-10, 10, 0.5, dtype=np.float32), -1), -1)
def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx,
             chkpt_dir, alpha=0.01, beta=0.01, fc1=64, fc2=64,
             gamma=0.95, tau=0.01):
    """
    Args:
        actor_dims: input dimensions of the actor (this agent's observation)
        critic_dims: input dimensions of the critic (joint state of all agents)
        n_actions: number of actions
        n_agents: number of agents
        agent_idx: agent index
        chkpt_dir: checkpoint directory
        alpha: actor learning rate
        beta: critic learning rate
        fc1: size of the first fully connected layer
        fc2: size of the second fully connected layer
        gamma: discount factor
        tau: soft update parameter
    """
    self.gamma = gamma
    self.tau = tau
    self.n_actions = n_actions
    self.agent_name = 'agent_%s' % agent_idx  # e.g., name = agent_1_actor

    self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                              chkpt_dir=chkpt_dir,
                              name=self.agent_name + '_actor')
    self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                chkpt_dir=chkpt_dir,
                                name=self.agent_name + '_critic')
    self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                     chkpt_dir=chkpt_dir,
                                     name=self.agent_name + '_target_actor')
    self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents,
                                       n_actions, chkpt_dir=chkpt_dir,
                                       name=self.agent_name + '_target_critic')

    # initially target networks and online networks have the same parameters
    self.update_network_parameters(tau=1)
                              epochs=50, is_target=True,
                              coin_boundary=COIN_BOUNDARY)
    actor_trainer = ActorNetwork(sess=sess, batch_size=BATCH_SIZE,
                                 batch_norm=BATCH_NORM, dropout=0.5,
                                 history_length=50, datacontainer=tc,
                                 epochs=50, is_target=False,
                                 coin_boundary=COIN_BOUNDARY)
    critic_target = CriticNetwork(sess=sess, batch_size=BATCH_SIZE,
                                  batch_norm=BATCH_NORM, dropout=0.5,
                                  history_length=50, datacontainer=tc,
                                  epochs=50, is_target=True)
    critic_trainer = CriticNetwork(sess=sess, batch_size=BATCH_SIZE,
                                   batch_norm=BATCH_NORM, dropout=0.5,
                                   history_length=50, datacontainer=tc,
                                   epochs=50, is_target=False)
    rpb = ReplayBuffer(buffer_size=BUFFER_SIZE)
    dpg = DDPG(sess=sess, batch_size=BATCH_SIZE, num_episodes=NUM_EPISODES,
def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))  # TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # Initialize target networks.
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory.
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # Main loop.
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                # print(s2)
                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r, terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                        step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's training targets.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not an episode,
                    # so an episode_avg_max statistic is not appropriate here.
                    pred_q, _ = critic.train(
                        s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    # print(grads[0].shape)
                    # exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)
                    break
                             state_boundary=boundary,
                             state_dimension=state_dim,
                             action_dimension=action_dim)
    actor_target = ActorNetwork(sess=sess, batch_size=BATCH_SIZE,
                                batch_norm=BATCH_NORM, learning_rate=LEARNING_RATE,
                                dropout=DROPOUT, is_target=True,
                                state_boundary=boundary,
                                state_dimension=state_dim,
                                action_dimension=action_dim)
    critic_trainer = CriticNetwork(sess=sess, batch_size=BATCH_SIZE,
                                   batch_norm=BATCH_NORM, learning_rate=LEARNING_RATE,
                                   dropout=DROPOUT, is_target=False,
                                   state_dimension=state_dim,
                                   action_dimension=action_dim)
    critic_target = CriticNetwork(sess=sess, batch_size=BATCH_SIZE,
                                  batch_norm=BATCH_NORM, learning_rate=LEARNING_RATE,
                                  dropout=DROPOUT, is_target=True,
                                  state_dimension=state_dim,
                                  action_dimension=action_dim)
    ddpg = DDPG(sess=sess, batch_size=BATCH_SIZE, num_episodes=NUM_EPISODES,
                episode_length=EPISODE_LENGTH,
import gym
import networks
from networks import ActorNetwork, CriticNetwork
from replay_buffer import ReplayBuffer

MINIBATCH_SIZE = 64
GAMMA = 0.99

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    max_steps = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]

    actor = ActorNetwork(state_dim, action_dim, action_bound)
    critic = CriticNetwork(state_dim, action_dim)
    replay_buffer = ReplayBuffer(10000)

    total = 0
    for episode in range(1000):
        obs0 = env.reset()
        ep_reward = 0
        for t in range(max_steps):
            if episode % 25 == 0:
                env.render()
            action = actor.act(obs0)  # TODO add noise for exploration
            obs1, reward, done, info = env.step(action)
            replay_buffer.add(obs0.reshape(state_dim), action.reshape(action_dim),
                              reward, t, obs1.reshape(state_dim))
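One way to address the "TODO add noise for exploration" in the loop above is to add clipped Gaussian noise to the deterministic action. This is a hedged sketch, not code from that script; the helper name and the sigma_frac scale are assumptions, and OU noise would work just as well.

import numpy as np

def noisy_action(action, action_bound, sigma_frac=0.1):
    # Add zero-mean Gaussian noise scaled to the action bound, then clip
    # the result back into the valid action range.
    noise = np.random.normal(0.0, sigma_frac * action_bound, size=np.shape(action))
    return np.clip(action + noise, -action_bound, action_bound)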