def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256,
             reward_scale=2):
    """Soft Actor-Critic agent (Keras): builds the actor, twin critics,
    value/target-value networks and the replay buffer.

    alpha is the actor learning rate, beta the rate for every other network;
    reward_scale weights the reward term in the critic targets.
    """
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    # Policy network plus SAC's twin Q-networks and value/target-value pair.
    self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(name='value')
    self.target_value = ValueNetwork(name='target_value')

    # Actor trains with alpha; all remaining networks with beta.
    for network, lr in ((self.actor, alpha), (self.critic_1, beta),
                        (self.critic_2, beta), (self.value, beta),
                        (self.target_value, beta)):
        network.compile(optimizer=Adam(learning_rate=lr))

    # tau=1 performs a hard copy so target_value starts identical to value.
    self.update_network_parameters(tau=1)
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=5e-3,
             fc1_dim=256, fc2_dim=256, batch_size=256, reward_scale=2):
    """SAC agent: actor, twin critics, and value/target-value networks.

    alpha/beta are the actor/critic-and-value learning rates.
    """
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    # NOTE(review): fc1_dim/fc2_dim are accepted but never forwarded to the
    # networks — confirm the network-side defaults are the intended sizes.
    self.actor = ActorNetwork(alpha, input_dims, n_actions,
                              env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    # Hard-copy value weights into the target network on construction.
    self.update_network_parameters(tau=1)
def __init__(self, input_dims, env, n_actions):
    """Discrete SAC agent: local/target twin critics plus value networks,
    all named with the configured environment id prefix."""
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions

    prefix = Constants.env_id
    # NOTE(review): max_action receives the discrete action count
    # (env.action_space.n) — confirm ActorNetwork expects that.
    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=prefix + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=prefix + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=prefix + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=prefix + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=prefix + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=prefix + '_value')
    self.target_value_nn = ValueNetwork(input_dims,
                                        name=prefix + '_target_value')

    # Hard update (tau=1) so the targets start as exact copies.
    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.0001, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    """SAC agent variant using twin critics with target copies (no value
    network); ent_alpha is the entropy temperature."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha
    self.reward_scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    def make_critic(name):
        # All four critics share one architecture; only the name differs.
        return CriticNetwork(beta, input_dims, n_actions=n_actions,
                             fc1_dims=layer1_size, fc2_dims=layer2_size,
                             name=name, chkpt_dir=chkpt_dir)

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', chkpt_dir=chkpt_dir)
    self.critic_1 = make_critic('critic_1')
    self.critic_2 = make_critic('critic_2')
    self.target_critic_1 = make_critic('target_critic_1')
    self.target_critic_2 = make_critic('target_critic_2')

    # Hard-copy weights into the target critics.
    self.update_network_parameters(tau=1)
def init_policy_network(self, shared_network=None, activation='sigmoid',
                        loss='binary_crossentropy'):
    """Build the policy network for the configured method/architecture.

    For rl_method 'td3' an ActorNetwork is created (stored as self.actor);
    otherwise a DNN or LSTM policy network is built according to self.net
    (stored as self.policy_network).  Previously saved weights are reloaded
    when self.reuse_models is set and a checkpoint exists.
    """
    if self.rl_method == 'td3':
        print("actor")
        self.actor = ActorNetwork(input_dim=self.num_features,
                                  output_dim=self.agent.NUM_ACTIONS,
                                  num_steps=self.num_steps,
                                  activation=activation, loss=loss,
                                  lr=self.lr)
        print(self.actor)
    elif self.net == 'dnn':
        self.policy_network = DNN(input_dim=self.num_features,
                                  output_dim=self.agent.NUM_ACTIONS,
                                  lr=self.lr, shared_network=shared_network,
                                  activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.policy_network = LSTMNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            lr=self.lr, num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    # BUG FIX: the 'td3' branch never sets self.policy_network, so the
    # previously unconditional reload below raised AttributeError whenever
    # reuse_models was enabled.  Only reload when a policy network exists.
    if self.reuse_models and \
            getattr(self, 'policy_network', None) is not None and \
            os.path.exists(self.policy_network_path):
        self.policy_network.load_model(model_path=self.policy_network_path)
def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256,
             layer2_size=256, batch_size=100, reward_scale=2):
    """SAC agent whose checkpoint names are prefixed with the environment id."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    # Actor, twin critics, and value/target-value pair; alpha for the actor,
    # beta for everything else.
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id + '_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions,
                                  name=env_id + '_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions,
                                  name=env_id + '_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id + '_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                     layer2_size,
                                     name=env_id + '_target_value')

    # Hard copy into the target value network.
    self.update_network_parameters(tau=1)
def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx,
             chkpt_dir, alpha=0.01, beta=0.01, fc1=64, fc2=64,
             gamma=0.95, tau=0.01):
    """One MADDPG agent: a per-agent actor plus a centralized critic,
    each with a target copy."""
    self.gamma = gamma
    self.tau = tau
    self.n_actions = n_actions
    self.agent_name = 'agent_%s' % agent_idx

    tag = self.agent_name
    # Actor sees this agent's observation; critic sees all agents' inputs
    # (centralized training, decentralized execution).
    self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                              chkpt_dir=chkpt_dir, name=tag + '_actor')
    self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents,
                                n_actions, chkpt_dir=chkpt_dir,
                                name=tag + '_critic')
    self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                     chkpt_dir=chkpt_dir,
                                     name=tag + '_target_actor')
    self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents,
                                       n_actions, chkpt_dir=chkpt_dir,
                                       name=tag + '_target_critic')

    # tau=1 -> targets start as exact copies of the online networks.
    self.update_network_parameters(tau=1)
def main(args):
    """Wire up the fake ROS agent/plant, the actor-critic networks, and run
    the learning loop with hyperparameters taken from the args dict."""
    with tf.Session() as session:
        # Seed numpy and TF for reproducibility.
        rng_seed = int(args['random_seed'])
        np.random.seed(rng_seed)
        tf.set_random_seed(rng_seed)

        # Fake ROS interface stand-ins.
        agent = fake.fake_agent()
        plant = fake.fake_plant()
        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # Function approximators; the critic needs the actor's variable count
        # to locate its own trainable variables.
        actor_network = ActorNetwork(session, state_shape, action_shape,
                                     action_bound, float(args['actor_lr']),
                                     float(args['tau']), loss_mask=True)
        critic_network = CriticNetwork(session, state_shape, action_shape,
                                       float(args['critic_lr']),
                                       float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session, actor_network, critic_network, predictor_network,
              agent, plant, latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
def __init__(self, docker_client, name='worker', port=3101,
             model_path='../models/ddpg', log_path='../logs/ddpg'):
    """DDPG worker: fixed TORCS-style state/action sizes, replay buffer,
    actor/critic networks, and TF session plumbing."""
    # Environment and training hyperparameters.
    self.state_size = 29
    self.action_size = 3
    self.docker_client = docker_client
    self.buffer_size = 100000
    self.batch_size = 32
    self.gamma = 0.99      # discount factor
    self.tau = 0.001       # target-network soft-update rate
    self.lra = 0.0001      # learning rate for the actor
    self.lrc = 0.001       # learning rate for the critic
    seed(6486)
    self.explore = 100000.
    self.episode_count = 2000
    self.max_steps = 10000
    self.epsilon = 1
    self.model_path = model_path
    self.port = port
    self.name = name
    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)

    # TF session config: grow GPU memory on demand instead of grabbing it all.
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    tf.reset_default_graph()
    self.summary_writer = tf.summary.FileWriter(log_path)

    # Networks, replay buffer, checkpointing, and summaries.
    self.actor = ActorNetwork(self.state_size, self.action_size,
                              tf.train.AdamOptimizer(self.lra), self.tau)
    self.critic = CriticNetwork(self.state_size, self.action_size,
                                tf.train.AdamOptimizer(self.lrc), self.tau)
    self.buff = ReplayBuffer(self.buffer_size)
    self.saver = tf.train.Saver()
    self._create_summary()
    self.summary_histogram = tf.summary.merge_all()
def __init__(self, alpha, beta, input_dims, action_bound, tau, env,
             gamma=0.99, n_actions=2, max_size=1000000,
             layer1_size=400, layer2_size=300, batch_size=64):
    """DDPG agent: actor/critic with target copies and Ornstein-Uhlenbeck
    exploration noise."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.action_bound = action_bound
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    def build(cls, lr, name):
        # Online and target networks share one layout; only lr/name vary.
        return cls(lr, input_dims, layer1_size, layer2_size,
                   n_actions=n_actions, name=name)

    self.actor = build(ActorNetwork, alpha, 'Actor')
    self.critic = build(CriticNetwork, beta, 'Critic')
    self.target_actor = build(ActorNetwork, alpha, 'TargetActor')
    self.target_critic = build(CriticNetwork, beta, 'TargetCritic')

    # OU process gives temporally correlated exploration noise.
    self.noise = OUActionNoise(mu=np.zeros(n_actions))

    # Hard-copy online weights into the targets.
    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0, \
             n_actions=2, max_size=1000000, layer1_size=400, \
             layer2_size=300, batch_size=100, reward_scale=2,
             path_dir='model/sac'):
    """SAC agent whose checkpoints live under path_dir."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='_actor',
                              max_action=max_action, chkpt_dir=path_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_1',
                                  chkpt_dir=path_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_2',
                                  chkpt_dir=path_dir)
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name='_value', chkpt_dir=path_dir)
    self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                     layer2_size, name='_target_value',
                                     chkpt_dir=path_dir)

    # tau=1 -> hard copy of value weights into target_value.
    self.update_network_parameters(tau=1)
def __init__(self, n_actions, n_states, obs_shape, gamma=0.99, lr=0.0003,
             gae_lambda=0.95, entropy_coeff=0.0005, ppo_clip=0.2,
             mini_batch_size=64, n_epochs=10, clip_value_loss=True,
             normalize_observation=False,
             stop_normalize_obs_after_timesteps=50000, fc1=64, fc2=64,
             environment='None', run=0):
    """PPO agent: actor/critic networks, GAE rollout memory, and optional
    running observation normalization."""
    # Loss / advantage hyperparameters.
    self.gamma = gamma
    self.gae_lambda = gae_lambda
    self.ppo_clip = ppo_clip
    self.entropy_coeff = entropy_coeff
    self.clip_value_loss = clip_value_loss
    self.n_epochs = n_epochs

    # Observation normalization is switched off after this many timesteps.
    self.normalize_observation = normalize_observation
    self.stop_obs_timesteps = stop_normalize_obs_after_timesteps
    self.timestep = 0

    self.actor = ActorNetwork(n_states=n_states, n_actions=n_actions, lr=lr,
                              fc1_dims=fc1, fc2_dims=fc2,
                              chkpt_dir=environment, run=run)
    self.critic = CriticNetwork(n_states=n_states, lr=lr, fc1_dims=fc1,
                                fc2_dims=fc2, chkpt_dir=environment, run=run)
    self.memory = PPOMemory(mini_batch_size, gamma, gae_lambda)
    # Running mean/std of observations for normalization.
    self.running_stats = RunningStats(shape_states=obs_shape,
                                      chkpt_dir=environment, run=run)
def __init__(self, state_size: int, action_size: int, gamma: float = 0.99,
             lr_actor: float = 0.001, lr_critic: float = 0.003,
             weight_decay: float = 0.0001, tau: float = 0.001,
             buffer_size: int = 100000, batch_size: int = 64):
    """DDPG agent (PyTorch).

    :param state_size: number of state inputs (network input size)
    :param action_size: number of actions the agent can choose from
    :param gamma: discount factor
    :param lr_actor: learning rate of the actor network
    :param lr_critic: learning rate of the critic network
    :param weight_decay: L2 penalty applied by the critic optimizer
    :param tau: soft-update interpolation parameter
    :param buffer_size: replay buffer capacity
    :param batch_size: size of the learning mini-batch
    """
    self.tau = tau
    self.gamma = gamma
    self.batch_size = batch_size

    # Local/target actor pair plus optimizer.
    self.actor_local = ActorNetwork(state_size, action_size).to(device)
    self.actor_target = ActorNetwork(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=lr_actor)
    print(self.actor_local)

    # Local/target critic pair plus optimizer (with weight decay).
    self.critic_local = CriticNetwork(state_size, action_size).to(device)
    self.critic_target = CriticNetwork(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=lr_critic,
                                       weight_decay=weight_decay)
    print(self.critic_local)

    # Targets start as exact copies of the local networks.
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
    # Gaussian noise would likely work as well; OU is the classic choice.
    self.noise = OUNoise(action_size)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256,
             reward_scale=2):
    """SAC agent: actor, twin critics, and value/target-value networks."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    # tau=1 performs a hard copy so the target value network starts
    # identical to the value network.
    self.update_network_parameters(tau=1)
def __init__(self, logger, replay_buffer):
    """Learner process/thread: owns the networks and training state.

    All variables and networks are created under the configured device and
    this object's name scope so the TF graph is placed consistently.
    """
    super(Learner, self).__init__(name="Learner")
    self.device = Params.DEVICE
    with tf.device(self.device), self.name_scope:
        self.dtype = Params.DTYPE
        self.logger = logger
        self.batch_size = Params.MINIBATCH_SIZE
        self.gamma = Params.GAMMA
        self.tau = Params.TAU
        self.replay_buffer = replay_buffer
        # Annealed beta for prioritized-replay importance sampling.
        self.priority_beta = tf.Variable(Params.BUFFER_PRIORITY_BETA_START)
        # Run flag and step counter kept as TF variables (shareable state).
        self.running = tf.Variable(True)
        self.n_steps = tf.Variable(0)

        # Init Networks — actor carries its own target network; the critic
        # here does not.
        self.actor = ActorNetwork(with_target_net=True)
        self.critic = CriticNetwork()

        # Save shared variables: policy weights exposed for other processes.
        self.policy_variables = self.actor.tvariables + self.actor.nvariables
def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
             update_actor_interval=2, warmup=1000, n_actions=2,
             max_size=1000000, layer1_size=400, layer2_size=300,
             batch_size=100, noise=0.1):
    """TD3 agent: twin critics, delayed actor updates, and action noise."""
    self.gamma = gamma
    self.tau = tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.learn_step_cntr = 0   # counts learn() calls for delayed actor updates
    self.time_step = 0         # counts environment steps for the warmup phase
    self.warmup = warmup
    self.n_actions = n_actions
    self.update_actor_iter = update_actor_interval
    self.noise = noise

    def build_actor(name):
        return ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                            n_actions=n_actions, name=name)

    def build_critic(name):
        return CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                             n_actions=n_actions, name=name)

    self.actor = build_actor('actor')
    self.critic_1 = build_critic('critic_1')
    self.critic_2 = build_critic('critic_2')
    self.target_actor = build_actor('target_actor')
    self.target_critic_1 = build_critic('target_critic_1')
    self.target_critic_2 = build_critic('target_critic_2')

    # Hard-copy online weights into all three target networks.
    self.update_network_parameters(tau=1)
def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99,
             tau=1e-3, lr_actor=1.0e-4, lr_critic=1.0e-3,
             weight_decay_actor=1e-5, weight_decay_critic=1e-4,
             clip_grad=1.0):
    """MADDPG agent (PyTorch): per-agent actor, centralized critic,
    target copies, optimizers, and OU exploration noise."""
    super(MADDPGAgent, self).__init__()

    # Hyperparameters.
    self.num_agents = num_agents
    self.gamma = gamma
    self.tau = tau
    self.clip_grad = clip_grad

    # Actor sees only its own observation; the critic sees all agents'
    # observations and actions (centralized training).
    self.actor = ActorNetwork(obs_size, act_size).to(device)
    self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
    self.target_actor = ActorNetwork(obs_size, act_size).to(device)
    self.target_critic = CriticNetwork(num_agents, obs_size,
                                       act_size).to(device)

    # Targets start as exact copies of the online networks.
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Exploration noise and per-network optimizers.
    self.noise = OUNoise(act_size, scale=1.0)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                weight_decay=weight_decay_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                 weight_decay=weight_decay_critic)
def __init__(self, input_dims, env, n_actions):
    """Discrete SAC-style agent: actor plus local and target twin critics,
    named with the configured environment id prefix."""
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions

    prefix = Constants.env_id
    # NOTE(review): max_action receives the discrete action count
    # (env.action_space.n) — confirm ActorNetwork expects that.
    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=prefix + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=prefix + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=prefix + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=prefix + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=prefix + '_critic_target_2')
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], max_action=1,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=512, layer2_size=512, batch_size=512,
             reward_scale=2):
    """SAC agent that takes max_action directly rather than reading it from
    an environment object."""
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.scale = reward_scale
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              name='actor', max_action=max_action)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  name='critic_2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    # Hard-copy value weights into the target network.
    self.update_network_parameters(tau=1)
(upper-mu)/sigma, loc=mu, scale=sigma )) return np.array(noise) # set up environment env = gym.make(ENV_NAME) bound = np.max(env.action_space.high) state = state_prime = env.reset() action = env.action_space.sample() a_dim = len(action) s_dim = len(state) # initialize critic network Q(s, a|θQ) and actor μ(s|θμ) with weights θQ and θμ actor = ActorNetwork(sess, state, action, ACTOR_LEARNING_RATE, TAU, bound) critic = CriticNetwork(sess, state, action, CRITIC_LEARNING_RATE, TAU) # initialize variables and store tensorboard graph sess.run(tf.initialize_all_variables()) summary_writer = tf.train.SummaryWriter("./tf_logs", graph=sess.graph) summary_writer.close() # initialize target network Q′ and μ′ with weights θQ′ ← θQ, θμ′ ← θμ actor.update_target_network() critic.update_target_network() # initialize replay buffer replay = ReplayBuffer( BUFFER_SIZE, random_seed=RANDOM_SEED, prioritized=PRIORITIZED )
import gym import numpy as np import torch as T from networks import ActorNetwork, CriticNetwork import gym_lqr if __name__ == '__main__': #env = gym.make('InvertedPendulumPyBulletEnv-v0') #env = gym.make('gym_lqr:lqr-stochastic-v0') env = gym.make('gym_lqr:lqr-v0') #env = gym.make('InvertedPendulum-v2') #print(env.action_space.shape[0]) actor = ActorNetwork(0.0003, input_dims=env.observation_space.shape, \ n_actions=env.action_space.shape[0], max_action=env.action_space.high) critic_1 = CriticNetwork(0.0003, input_dims=env.observation_space.shape, \ n_actions=env.action_space.shape[0], name='critic_1') critic_2 = CriticNetwork(0.0003, input_dims=env.observation_space.shape, \ n_actions=env.action_space.shape[0], name='critic_2') ActorNetwork.load_checkpoint(actor) critic_1.load_checkpoint() critic_2.load_checkpoint() # Load optimal P env.set_P(np.load('tmp/sac/optimal_P.npy')) # Create States and Actions states = np.expand_dims(
class Agent:
    """Soft Actor-Critic agent (TensorFlow 2 / Keras).

    Holds the actor, twin critics, and value/target-value networks plus a
    replay buffer; learn() performs one SAC update of all of them.
    """

    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        # NOTE(review): layer1_size/layer2_size are accepted but never
        # forwarded to the networks — confirm the network defaults match.
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')
        # Actor trains with alpha; every other network with beta.
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))
        self.scale = reward_scale
        # tau=1 performs a hard copy of value weights into target_value.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Sample an action from the current policy for one observation."""
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        """Store a single transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        """Soft-update the target value network:
        target = tau * value + (1 - tau) * target."""
        if tau is None:
            tau = self.tau
        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_value.set_weights(weights)

    def save_models(self):
        """Write every network's weights to its checkpoint file."""
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        """Restore every network's weights from its checkpoint file."""
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        """One SAC update: value network, actor, then both critics, followed
        by a soft update of the target value network."""
        # Wait until the buffer can supply a full batch.
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        # --- Value network update: V(s) regresses toward
        #     min(Q1, Q2)(s, a~pi) - log pi(a|s). ---
        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(value_loss,
                                               self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        # --- Actor update: minimize E[log pi(a|s) - min(Q1, Q2)(s, a)]
        #     with reparameterized sampling so gradients flow through a. ---
        with tf.GradientTape() as tape:
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        # --- Critic updates: one persistent tape so both losses can take
        #     gradients from the same forward computation. ---
        with tf.GradientTape(persistent=True) as tape:
            # NOTE(review): uses the raw numpy `reward`/`done`/`state`/
            # `action` instead of the tensors built above — works via
            # auto-conversion, but the tensor versions look intended.
            q_hat = self.scale * reward + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(state, action), 1)
            q2_old_policy = tf.squeeze(self.critic_2(state, action), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)
        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        # Soft-update the target value network.
        self.update_network_parameters()
def train(self):
    """Run epsilon-greedy DDPG training over self.num_episodes episodes.

    Epsilon decays linearly to a floor of 0.1; after each episode an
    inference pass runs, and per-episode reward sums are plotted at the end.
    """
    epsilon = 1.00
    epsiode_rewards = []  # (sic) sum of rewards per episode
    for episode in range(1, self.num_episodes + 1):
        state, reward = self.tsm.initialize()
        rewards = []
        for _ in tqdm(range(self.tsm.episode_length)):
            # Epsilon-greedy: random action with prob epsilon, else policy.
            if random.random() < epsilon:
                action = self.random_action()
            else:
                action = self.actor_trainer.select_action(
                    inputs=np.array([state.features]))[0][0]
            trans_state, reward = self.tsm.step(action)
            rewards.append(reward)
            self.rpb.store(old_state=state, action=action, reward=reward,
                           new_state=trans_state)
            if self.rpb.ready(self.batch_size):
                transitions = self.rpb.sample(batch_size=self.batch_size,
                                              recurrent=False)
                batch_states = []   # [batch_size, num_assets, num_features]
                batch_actions = []  # [batch_size, 1]
                batch_y = []        # [batch_size, 1]
                # NOTE(review): this loop rebinds the outer `action` and
                # `reward` names; both are reassigned before their next use,
                # but the shadowing is fragile.
                for transition in transitions:
                    old_state, action, reward, new_state = transition
                    # Bootstrapped target: y = r + gamma * Q'(s', mu'(s')).
                    target_action = self.actor_target.select_action(
                        inputs=np.array([new_state.features]))
                    target_q = self.critic_target.get_q_value(
                        inputs=np.array([new_state.features]),
                        actions=target_action)[0]
                    y = reward + self.gamma * target_q
                    batch_y.append(y)
                    batch_states.append(old_state.features)
                    batch_actions.append([action])
                # Critic regression toward the bootstrapped targets.
                self.critic_trainer.train_step(
                    inputs=np.array(batch_states),
                    actions=np.array(batch_actions),
                    predicted_q_value=np.array(batch_y))
                # Deterministic policy gradient: dQ/da at the current policy.
                policy_actions = self.actor_trainer.select_action(
                    inputs=np.array(
                        batch_states))  # [batch_size, num_assets]
                action_grads = self.critic_trainer.get_action_gradients(
                    inputs=np.array(batch_states),
                    actions=policy_actions)[0]
                self.actor_trainer.train_step(
                    inputs=np.array(batch_states),
                    action_gradient=np.array(action_grads))
                # Soft-update both target networks in the shared session.
                ActorNetwork.update_actor(self.sess, self.tau)
                CriticNetwork.update_critic(self.sess, self.tau)
            state = trans_state
        epsiode_rewards.append(np.sum(rewards))
        # Linear epsilon decay with a 0.1 floor.
        if epsilon > 0.1:
            epsilon -= 2.0 / self.num_episodes
        if (episode % 1) == 0:
            self.infer(train=False, episode=episode)
    plt.plot(epsiode_rewards)
    plt.savefig("./episode_rewards.png")
    self.infer(train=False, episode=episode)