def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
        # TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # Initialize the target networks.
        actor.update_target_network()
        critic.update_target_network()

        # Initialize the replay memory.
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # Main loop.
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                        step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's training targets.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not from the episode,
                    # so an "episode_avg_max" style statistic is not appropriate here.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update the target networks.
                    # Should this perhaps be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    # Guard against division by zero when the first step is terminal.
                    q_log.append(ep_batch_avg_q / max(step, 1))
                    break
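# The loop above leaves exploration noise as a TODO and only references
# OrnsteinUhlenbeckActionNoise in a commented-out line. Below is a minimal,
# hypothetical sketch of such a class; every constructor argument beyond `mu`
# (theta, sigma, dt, x0) is an assumption, not taken from the original code.
import numpy as np


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # Euler-Maruyama discretization of the OU process:
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)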
class DDPG(object):
    """Implementation of the deep deterministic policy gradient algorithm"""

    def __init__(self, docker_client, name='worker', port=3101,
                 model_path='../models/ddpg', log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 3
        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # target network hyperparameter
        self.lra = 0.0001  # learning rate for the actor
        self.lrc = 0.001  # learning rate for the critic

        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True

        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(self.state_size, self.action_size,
                                  tf.train.AdamOptimizer(self.lra), self.tau)
        self.critic = CriticNetwork(self.state_size, self.action_size,
                                    tf.train.AdamOptimizer(self.lrc), self.tau)
        self.buff = ReplayBuffer(self.buffer_size)

        self.saver = tf.train.Saver()

        self._create_summary()
        self.summary_histogram = tf.summary.merge_all()

    def _create_summary(self):
        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar(
                'loss', self.critic.loss, collections=['loss'])

            self.reward_ph = tf.placeholder(
                shape=[None, ], name='reward', dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_y_t',
                dtype=tf.float32)

            tf.summary.scalar('reward', tf.reduce_mean(self.reward_ph),
                              collections=['reward'])
            tf.summary.scalar('target_q_values',
                              tf.reduce_mean(self.target_q_values_ph),
                              collections=['reward'])
            tf.summary.scalar('y_t', tf.reduce_mean(self.y_t_ph),
                              collections=['reward'])

            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):
        """Adds noise from an Ornstein-Uhlenbeck process to the actions"""

        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = (max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30))
        noise[1] = (max(epsilon, 0) * ou_func(a[1], 0.5, 1.00, 0.10))
        noise[2] = (max(epsilon, 0) * ou_func(a[2], -0.1, 1.00, 0.10))

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]
        a_new[2] = a[2] + noise[2]

        return a_new

    def train(self, track_name='', check_stuck=True):

        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(self.docker_client, self.name, self.port,
                                 training=True)
        else:
            env = TorcsDockerEnv(self.docker_client, self.name, self.port,
                                 track_name=track_name)

        with tf.Session(config=self.config) as sess:

            sess.run(tf.global_variables_initializer())

            ckpt = tf.train.latest_checkpoint(self.model_path)
            if ckpt:
                print('load model weights from {}'.format(ckpt))
                self.saver.restore(sess, ckpt)

            for i in range(self.episode_count):

                # Collect the most recent rewards.
                recent_rewards = np.ones(1000) * 1e9

                print("Episode : " + str(i) + " Replay Buffer "
                      + str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)

                total_reward = 0
                for j in range(self.max_steps):

                    loss = 0
                    # Reduce the effect of the OU process as the algorithm
                    # progresses.
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))

                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    # Check whether to terminate because the agent is stuck.
                    recent_rewards[j % 1000] = reward_t
                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i / self.episode_count < 0.5):
                        break

                    self.buff.add(state_t, action_t[0], reward_t, state_t1,
                                  done)

                    batch = self.buff.getBatch(self.batch_size)

                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (rewards[k]
                                      + self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)

                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(sess, states,
                                                  actions_for_grad)
                    self.actor.train(sess, states, grads)

                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1

                    if j % 50 == 0:  # log summaries every 50 steps
                        loss_summary, reward_summary, histogram = sess.run(
                            [
                                self.loss_summary_op,
                                self.reward_summary_op,
                                self.summary_histogram
                            ],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.actor.state: states,
                                self.actor.target_state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t
                            })

                        self.summary_writer.add_summary(loss_summary,
                                                        all_steps)
                        self.summary_writer.add_summary(reward_summary,
                                                        all_steps)
                        self.summary_writer.add_summary(histogram, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1

                    print("Episode", i, "Step", all_steps, "Action", action_t,
                          "Reward", reward_t, "Loss", loss)

                    if done:
                        break

                print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward "
                      + str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess,
                        self.model_path + '/model-{:d}.cptk'.format(i))

        env.end()
batch = replay.sample_batch(BATCH_SIZE)

batch_state = np.reshape(batch[0], (BATCH_SIZE, s_dim))
batch_action = np.reshape(batch[1], (BATCH_SIZE, a_dim))
batch_reward = np.reshape(batch[2], (BATCH_SIZE, 1))
batch_state_prime = np.reshape(batch[3], (BATCH_SIZE, s_dim))
batch_terminal = np.reshape(batch[4], (BATCH_SIZE, 1))
idx = batch[5]

# Set y = r + γQ′.
q_prime = critic.q_target(
    batch_state_prime, actor.act_target(batch_state_prime)
)
y = batch_reward + GAMMA * (q_prime * (1 - batch_terminal))

# Update the critic by minimizing the loss: l = (y − Q)^2.
loss, _ = critic.train(batch_state, batch_action, y)

# Update the replay memory losses (priorities).
if PRIORITIZED:
    replay.update(loss, idx)

# Update the actor policy using the sampled policy gradient:
#   ∇_θμ J ≈ ∇_a Q(s, a|θQ) * ∇_θμ μ(s|θμ)
actions = actor.act(batch_state)
gradients = critic.policy_gradients(batch_state, actions)
actor.train(batch_state, gradients[0])

# Update the target networks:
#   θQ′ ← τθQ + (1−τ)θQ′
actor.update_target_network()
#   θμ′ ← τθμ + (1−τ)θμ′
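# In the prioritized-replay branch above, `replay.update(loss, idx)` refreshes
# the priorities of the sampled transitions. The sketch below illustrates the
# usual proportional-prioritization rule p_i = (|delta_i| + eps)^alpha applied
# per sample; the function name, the `priorities` array, and the signature are
# illustrative assumptions, not the buffer API used above.
import numpy as np


def update_priorities(priorities, idx, td_errors, alpha=0.6, eps=1e-6):
    # td_errors: per-sample |y - Q(s, a)| for the sampled batch, shape (BATCH_SIZE,)
    priorities[idx] = (np.abs(td_errors) + eps) ** alpha
    return priorities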
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self, rl_method='rl', stock_code=None, chart_data=None,
                 training_data=None, min_trading_unit=1, max_trading_unit=2,
                 delayed_reward_threshold=.05, net='dnn', num_steps=1,
                 lr=0.001, value_network=None, policy_network=None,
                 output_path='', reuse_models=True):
        # Check the arguments.
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method.
        self.rl_method = rl_method
        # Environment.
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent.
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data.
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size.
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Network settings.
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module.
        self.visualizer = Visualizer()
        # Memories.
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Epoch-related information.
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs, etc.
        self.output_path = output_path
        # For the delayed policy update.
        self._update_step = 0
        self._target_update_interval = 2

    def init_policy_network(self, shared_network=None, activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.rl_method == 'td3':
            print("actor")
            self.actor = ActorNetwork(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      num_steps=self.num_steps,
                                      activation=activation, loss=loss,
                                      lr=self.lr)
            print(self.actor)
        elif self.net == 'dnn':
            self.policy_network = DNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      shared_network=shared_network,
                                      activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        if self.reuse_models and \
                os.path.exists(self.policy_network_path):
            self.policy_network.load_model(
                model_path=self.policy_network_path)

    def init_value_network(self, shared_network=None, activation='linear',
                           loss='mse'):
        if self.rl_method == 'td3':
            self.critic = CriticNetwork(input_dim=self.num_features,
                                        output_dim=self.agent.NUM_ACTIONS,
                                        num_steps=self.num_steps,
                                        activation=activation, loss=loss,
                                        lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     shared_network=shared_network,
                                     activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        if self.reuse_models and \
                os.path.exists(self.value_network_path):
            self.value_network.load_model(
                model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment.
        self.environment.reset()
        # Reset the agent.
        self.agent.reset()
        # Reset the visualizer.
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset the memories.
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset the epoch-related information.
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build the training batch.
        x, policy, y_value1, y_value2, critic_target = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            loss += self.critic.train(x, y_value2, y_value2, critic_target)
            if self._update_step % self._target_update_interval == 0:
                # Update the actor.
                loss += self.actor.train(x, policy)
                # Update the target networks.
                self.actor.target_update()
                self.critic.target_update()
            self._update_step = self._update_step + 1  # don't forget to reset
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # Build the training batch and update the networks.
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str, num_epoches=num_epoches,
            epsilon=epsilon, action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(
            os.path.join(self.epoch_summary_dir,
                         'epoch_summary_{}.png'.format(epoch_str)))

    def run(self, num_epoches=100, balance=10000000, discount_factor=0.9,
            start_epsilon=0.5, learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
                code=self.stock_code, rl=self.rl_method, net=self.net,
                lr=self.lr, discount_factor=discount_factor,
                min_trading_unit=self.agent.min_trading_unit,
                max_trading_unit=self.agent.max_trading_unit,
                delayed_reward_threshold=self.agent.delayed_reward_threshold
            )
        with self.lock:
            logging.info(info)

        # Start time.
        time_start = time.time()

        # Prepare the visualization.
        # The chart data does not change, so visualize it up front.
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder in which visualization results are stored.
        self.epoch_summary_dir = os.path.join(
            self.output_path,
            'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial capital.
        self.agent.set_balance(balance)

        # Training statistics.
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop.
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue used to build samples of length num_steps.
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer, and memories.
            self.reset()

            # Decrease the exploration rate as training progresses.
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon

            while True:
                # Build a sample.
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Collect num_steps samples.
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Predictions from the value and policy networks.
                pred_value = None
                pred_value2 = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_value2 = self.critic.predict2(list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_model1_predict(
                        list(q_sample))

                # Decide an action from the networks or by exploration.
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide an action using the target values.
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(pred_target_policy,
                                             pred_target_value, epsilon)

                # Perform the chosen action and obtain the immediate and
                # delayed rewards.
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its outcome.
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                    self.memory_value2.append(pred_value2)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update the iteration information.
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training whenever a delayed reward occurs.
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Training after the epoch ends.
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log epoch-related information.
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                 self.stock_code, epoch_str, num_epoches,
                                 epsilon, self.exploration_cnt, self.itr_cnt,
                                 self.agent.num_buy, self.agent.num_sell,
                                 self.agent.num_hold, self.agent.num_stocks,
                                 self.agent.portfolio_value,
                                 self.learning_cnt, self.loss,
                                 elapsed_time_epoch))

            # Visualize epoch-related information.
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update the training statistics.
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time.
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log the training statistics.
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code, elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
class Agent(object):
    def __init__(self, alpha, beta, input_dims, action_bound, tau, env,
                 gamma=0.99, n_actions=2, max_size=1000000, layer1_size=400,
                 layer2_size=300, batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.action_bound = action_bound

        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='Actor')
        self.critic = CriticNetwork(beta, input_dims, layer1_size,
                                    layer2_size, n_actions=n_actions,
                                    name='Critic')

        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                                         layer2_size, n_actions=n_actions,
                                         name='TargetActor')
        self.target_critic = CriticNetwork(beta, input_dims, layer1_size,
                                           layer2_size, n_actions=n_actions,
                                           name='TargetCritic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = T.tensor(observation,
                               dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(),
                                 dtype=T.float).to(self.actor.device)
        self.actor.train()
        return (mu_prime * T.tensor(self.action_bound)).cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic.device)
        done = T.tensor(done).to(self.critic.device)
        new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device)
        action = T.tensor(action, dtype=T.float).to(self.critic.device)
        state = T.tensor(state, dtype=T.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # Note: this assumes the replay buffer stores the terminal flag as
        # (1 - done), so multiplying by `done` zeroes out the bootstrap term
        # for terminal transitions.
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = T.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

        """
        # Verify that the copy assignment worked correctly.
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()
        critic_state_dict = dict(target_critic_params)
        actor_state_dict = dict(target_actor_params)
        print('\nActor Networks', tau)
        for name, param in self.actor.named_parameters():
            print(name, T.equal(param, actor_state_dict[name]))
        print('\nCritic Networks', tau)
        for name, param in self.critic.named_parameters():
            print(name, T.equal(param, critic_state_dict[name]))
        input()
        """

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def check_actor_params(self):
        current_actor_params = self.actor.named_parameters()
        current_actor_dict = dict(current_actor_params)
        original_actor_dict = dict(self.original_actor.named_parameters())
        original_critic_dict = dict(self.original_critic.named_parameters())
        current_critic_params = self.critic.named_parameters()
        current_critic_dict = dict(current_critic_params)
        print('Checking Actor parameters')
        for param in current_actor_dict:
            print(param,
                  T.equal(original_actor_dict[param],
                          current_actor_dict[param]))
        print('Checking critic parameters')
        for param in current_critic_dict:
            print(param,
                  T.equal(original_critic_dict[param],
                          current_critic_dict[param]))
        input()
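# A minimal, hypothetical training loop for the Agent above, assuming a
# classic Gym-style continuous-control environment (reset() returns an
# observation, step() returns four values). The environment id and the
# hyperparameter values are illustrative assumptions, not taken from the
# original code.
import gym

env = gym.make('Pendulum-v1')
agent = Agent(alpha=0.0001, beta=0.001,
              input_dims=env.observation_space.shape,
              action_bound=env.action_space.high, tau=0.001, env=env,
              n_actions=env.action_space.shape[0])

for episode in range(1000):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        new_obs, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_obs, int(done))
        agent.learn()  # one gradient step per environment step
        score += reward
        obs = new_obs
    print('episode', episode, 'score %.2f' % score)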
# Compute the target Q values.
target_action = actor.predict_target(s2_batch)
target_q = critic.predict_target(s2_batch, target_action)

# Compute the critic's training targets.
targets = []
for i in range(MINIBATCH_SIZE):
    if t_batch[i]:  # terminal
        targets.append(r_batch[i])
    else:
        targets.append(r_batch[i] + GAMMA * target_q[i])

# Train the critic.
# TODO: pred_q comes from a random batch, not from the episode,
# so an "episode_avg_max" style statistic is not appropriate here.
pred_q, _ = critic.train(
    s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))

# Train the actor.
actor.train(s_batch, a_batch)

# Update the target networks.
# Should this perhaps be done only once every few batches?
actor.update_target_network()
critic.update_target_network()

s = s2
episode_reward += r
episode_avg_max_Q += np.amax(pred_q)

if terminal:
    print('Episode:', ep, 'Reward:', episode_reward)
class DdpgAgent:
    """
    A Deep Deterministic Policy Gradient Agent.
    Interacts with and learns from the environment.
    """

    def __init__(self, num_agents, state_size, action_size, random_seed):
        """
        Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents observed at the same time.
                multiple agents are handled within the class.
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        self.t_step = 0  # counter that increases each time step() is executed
        self.state_size = state_size
        self.action_size = action_size

        # Actor network (with target network)
        self.actor_local = ActorNetwork(
            state_size, action_size, USE_BATCH_NORM, random_seed,
            fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
            fc3_units=FC3_UNITS).to(device)
        self.actor_target = ActorNetwork(
            state_size, action_size, USE_BATCH_NORM, random_seed,
            fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
            fc3_units=FC3_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_ACTOR)
        # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR,
        #                                      weight_decay=WEIGHT_DECAY_ACTOR)
        # RMSprop also solves the task, but Adam is quicker.

        # Critic network (with target network)
        self.critic_local = CriticNetwork(
            state_size, action_size, USE_BATCH_NORM, random_seed,
            fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
            fc3_units=FC3_UNITS).to(device)
        self.critic_target = CriticNetwork(
            state_size, action_size, USE_BATCH_NORM, random_seed,
            fc1_units=FC1_UNITS, fc2_units=FC2_UNITS,
            fc3_units=FC3_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CRITIC)
        # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                       weight_decay=WEIGHT_DECAY_CRITIC)
        # RMSprop also solves the task, but Adam is quicker.

        # Make sure the targets are initialized with the same weights as the
        # local networks.
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        # Set the default modes for the networks.
        # Target networks do not need to train, so they stay in eval().
        # Local networks stay in training mode unless altered in code,
        # e.g. when acting.
        self.actor_local.train()
        self.actor_target.eval()
        self.critic_local.train()
        self.critic_target.eval()

        # Action noise process (encourages exploration during training).
        # Parameter noise could be considered in the future as a potentially
        # better alternative / addition.
        if ACTION_NOISE_METHOD == 'initial':
            self.noise = InitialOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                theta=NOISE_THETA,
                sigma=NOISE_SIGMA)
        elif ACTION_NOISE_METHOD == 'adjusted':
            self.noise = AdjustedOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                sigma=NOISE_SIGMA,
                theta=NOISE_THETA,
                dt=NOISE_DT,
                sigma_delta=NOISE_SIGMA_DELTA,
            )
        else:
            raise ValueError('Unknown action noise method: '
                             + ACTION_NOISE_METHOD)

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            sampling_method=REPLAY_BUFFER_SAMPLING_METHOD,
            random_seed=random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use a random sample from the
        buffer to learn."""
        self.t_step += 1
        # Save experience / reward.
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn every UPDATE_EVERY steps, if enough samples are available in
        # memory.
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_action_noise=False):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        # The training state is restored right before actual training.
        self.actor_local.eval()
        # All calculations here use no_grad, although many examples do not do
        # this. Oddly, this variant is slower.
        with torch.no_grad():
            return np.clip(
                self.actor_local(states).cpu().data.numpy()
                + (self.noise.sample() if add_action_noise else 0), -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using a given batch of experience
        tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): reward discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # critic_local is always in train mode, but actor_local goes into
        # eval mode while acting.
        self.actor_local.train()

        # Critic
        # Get predicted next-state actions and Q values from the target models.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for the current states (y_i).
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute the critic loss.
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_GRADIENT_CRITIC:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor
        # Compute the actor loss.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss.
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if CLIP_GRADIENT_ACTOR:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # Soft update of the target networks.
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft-update target model parameters from local model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)
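# The DdpgAgent above reads its hyperparameters from module-level constants.
# The block below only illustrates what such a configuration could look like;
# the values are common DDPG defaults chosen for the sketch, not the settings
# used by the original author.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

USE_BATCH_NORM = True
FC1_UNITS, FC2_UNITS, FC3_UNITS = 256, 128, 64
LR_ACTOR, LR_CRITIC = 1e-4, 1e-3
WEIGHT_DECAY_ACTOR, WEIGHT_DECAY_CRITIC = 0.0, 0.0
GAMMA, TAU = 0.99, 1e-3
REPLAY_BUFFER_SIZE, BATCH_SIZE = int(1e6), 128
REPLAY_BUFFER_SAMPLING_METHOD = 'uniform'
UPDATE_EVERY = 1
CLIP_GRADIENT_ACTOR, CLIP_GRADIENT_CRITIC = False, True
ACTION_NOISE_METHOD = 'initial'
NOISE_THETA, NOISE_SIGMA = 0.15, 0.2
NOISE_DT, NOISE_SIGMA_DELTA = 1e-2, 0.0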
class Learner(tf.Module):

    def __init__(self, logger, replay_buffer):
        super(Learner, self).__init__(name="Learner")

        self.device = Params.DEVICE

        with tf.device(self.device), self.name_scope:

            self.dtype = Params.DTYPE
            self.logger = logger
            self.batch_size = Params.MINIBATCH_SIZE
            self.gamma = Params.GAMMA
            self.tau = Params.TAU
            self.replay_buffer = replay_buffer
            self.priority_beta = tf.Variable(
                Params.BUFFER_PRIORITY_BETA_START)
            self.running = tf.Variable(True)
            self.n_steps = tf.Variable(0)

            # Init networks
            self.actor = ActorNetwork(with_target_net=True)
            self.critic = CriticNetwork()

            # Save shared variables
            self.policy_variables = self.actor.tvariables + \
                self.actor.nvariables

    def save_model(self, path):
        self.actor.actor_network.save(path, overwrite=True,
                                      include_optimizer=False,
                                      save_format="tf")

    @tf.function()
    def run(self):
        print("retracing learner run")

        with tf.device(self.device), self.name_scope:

            # Wait for the replay buffer to fill.
            tf.cond(
                Params.BUFFER_FROM_REVERB,
                lambda: [],
                lambda: tf.while_loop(
                    lambda: tf.logical_and(
                        tf.less(self.replay_buffer.size(),
                                Params.MINIBATCH_SIZE),
                        self.running),
                    lambda: [],
                    loop_vars=[],
                    parallel_iterations=1,
                ),
            )

            # Do the training.
            n_steps = tf.while_loop(
                lambda n_step: tf.logical_and(
                    tf.less_equal(n_step, Params.MAX_STEPS_TRAIN),
                    self.running),
                self.train_step,
                loop_vars=[tf.constant(1)])

            # Save the number of performed steps.
            self.n_steps.assign(tf.squeeze(n_steps))

    def train_step(self, n_step):
        print("retracing train_step")

        with tf.device(self.device), self.name_scope:

            print("Eager Execution: ", tf.executing_eagerly())
            print("Eager Keras Model:", self.actor.actor_network.run_eagerly)

            if Params.BUFFER_FROM_REVERB:
                # Get a batch from the replay memory.
                (s_batch, a_batch, _, _, s2_batch, target_z_atoms_batch), \
                    weights_batch, idxes_batch = \
                    self.replay_buffer.sample_batch(self.batch_size,
                                                    self.priority_beta)
            else:
                # Get a batch from the replay memory.
                (s_batch, a_batch, r_batch, t_batch, s2_batch, g_batch), \
                    weights_batch, idxes_batch = \
                    self.replay_buffer.sample_batch(self.batch_size,
                                                    self.priority_beta)

                # Compute the targets (Bellman update).
                target_z_atoms_batch = tf.where(t_batch, 0., Params.Z_ATOMS)
                target_z_atoms_batch = r_batch + (target_z_atoms_batch
                                                  * g_batch)

            # Predict the target Q value with the target critic network.
            target_action_batch = self.actor.target_actor_network(
                s2_batch, training=False)
            target_q_probs = tf.cast(
                self.critic.target_critic_network(
                    [s2_batch, target_action_batch], training=False),
                self.dtype)

            # Train the critic on the given targets.
            td_error_batch = self.critic.train(
                x=[s_batch, a_batch],
                target_z_atoms=target_z_atoms_batch,
                target_q_probs=target_q_probs,
                is_weights=weights_batch)

            # Compute the actions for the state batch.
            actions = self.actor.actor_network(s_batch, training=False)

            # Compute and negate the action values (to enable gradient
            # ascent).
            values = self.critic.critic_network([s_batch, actions],
                                                training=False)

            # Compute (dq/da * da/dtheta = dq/dtheta), i.e. the gradients of
            # the action values w.r.t. the actor network weights.
            action_gradients = tf.gradients(values, actions,
                                            Params.Z_ATOMS)[0]
            actor_gradients = tf.gradients(actions, self.actor.tvariables,
                                           -action_gradients)

            # Normalize the gradients element-wise.
            actor_gradients = [
                tf.divide(gradient, tf.cast(self.batch_size, self.dtype))
                for gradient in actor_gradients
            ]

            # Apply the gradients to the actor net.
            self.actor.actor_network.optimizer.apply_gradients(
                zip(actor_gradients, self.actor.tvariables))

            # Update the target networks.
            update_weights(
                self.actor.target_tvariables + self.actor.target_nvariables,
                self.actor.tvariables + self.actor.nvariables, self.tau)
            update_weights(
                self.critic.target_tvariables + self.critic.target_nvariables,
                self.critic.tvariables + self.critic.nvariables, self.tau)

            # Use the critic TD error to update the priorities.
            self.replay_buffer.update_priorities(idxes_batch, td_error_batch)

            # Increment the beta value.
            self.priority_beta.assign_add(
                Params.BUFFER_PRIORITY_BETA_INCREMENT)

            # Log the status.
            tf.cond(
                tf.equal(
                    tf.math.mod(n_step,
                                tf.constant(Params.LEARNER_LOG_STEPS)),
                    tf.constant(0)),
                lambda: self.logger.log_step_learner(
                    n_step,
                    tf.cast(tf.reduce_mean(td_error_batch), Params.DTYPE),
                    self.priority_beta),
                lambda: None)

            return tf.add(n_step, 1)
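# The Learner above calls an update_weights() helper that is not shown in this
# excerpt. Below is a minimal sketch of what such a Polyak (soft) update could
# look like, assuming the helper receives the flattened target and source
# variable lists in the order used at the call sites; this is an assumption,
# not the original implementation.
import tensorflow as tf


def update_weights(target_variables, source_variables, tau):
    # target <- tau * source + (1 - tau) * target, variable by variable
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)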