class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self, rl_method='rl', stock_code=None, chart_data=None,
                 training_data=None, min_trading_unit=1, max_trading_unit=2,
                 delayed_reward_threshold=.05, net='dnn', num_steps=1,
                 lr=0.001, value_network=None, policy_network=None,
                 output_path='', reuse_models=True):
        # Check arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Neural network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module
        self.visualizer = Visualizer()
        # Memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Per-epoch bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs and other artifacts
        self.output_path = output_path

    def init_policy_network(self, shared_network=None, activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.rl_method == 'ddpg':
            self.actor = ActorNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation,
                loss=loss, lr=self.lr)
        elif self.net == 'dnn':
            self.policy_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.policy_network_path):
            self.policy_network.load_model(
                model_path=self.policy_network_path)

    def init_value_network(self, shared_network=None, activation='linear',
                           loss='mse'):
        if self.rl_method == 'ddpg':
            self.critic = CriticNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation,
                loss=loss, lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        if self.reuse_models and os.path.exists(self.value_network_path):
            self.value_network.load_model(
                model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset per-epoch bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build the mini-batch training data
        x, y_value, y_policy = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # Update the value network (critic)
                loss += self.critic.train_on_batch(x, y_value)
                self.critic.transfer_weights()
            if y_policy is not None:
                # Update the policy network (actor)
                loss += self.actor.train_on_batch(x, y_policy)
                self.actor.transfer_weights()
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full else self.batch_size
        # Build the batch data and update the networks
        if batch_size > 0:
            _loss = self.update_networks(
                batch_size, delayed_reward, discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str, num_epoches=num_epoches, epsilon=epsilon,
            action_list=Agent.ACTIONS, actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(os.path.join(
            self.epoch_summary_dir,
            'epoch_summary_{}.png'.format(epoch_str)))

    def run(self, num_epoches=100, balance=10000000, discount_factor=0.9,
            start_epsilon=0.5, learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
                code=self.stock_code, rl=self.rl_method, net=self.net,
                lr=self.lr, discount_factor=discount_factor,
                min_trading_unit=self.agent.min_trading_unit,
                max_trading_unit=self.agent.max_trading_unit,
                delayed_reward_threshold=self.agent.delayed_reward_threshold)
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so plot it up front
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the directory for the visualization output
        self.epoch_summary_dir = os.path.join(
            self.output_path,
            'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop over epochs
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue holding the last num_steps samples
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer, and memory
            self.reset()

            # Decay the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon

            while True:
                # Build a sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Collect num_steps samples before predicting
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                pred_value = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_target_value = self.critic.target_predict(
                        list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_predict(
                        list(q_sample))

                # Decide an action from the networks or by exploration
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide an action from the target-network predictions
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(
                        pred_target_value, pred_target_policy, epsilon)

                # Perform the chosen action and receive the immediate and
                # delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its outcome
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                self.memory_target_value.append(pred_target_value)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration counters
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training whenever a delayed reward occurs
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Train once more at the end of the epoch
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log epoch information
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info(
                    "[{}][Epoch {}/{}] Epsilon:{:.4f} "
                    "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                    "#Stocks:{} PV:{:,.0f} "
                    "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                        self.stock_code, epoch_str, num_epoches, epsilon,
                        self.exploration_cnt, self.itr_cnt,
                        self.agent.num_buy, self.agent.num_sell,
                        self.agent.num_hold, self.agent.num_stocks,
                        self.agent.portfolio_value, self.learning_cnt,
                        self.loss, elapsed_time_epoch))

            # Visualize epoch information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training statistics
            max_portfolio_value = max(
                max_portfolio_value, self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log training statistics
        with self.lock:
            logging.info(
                "[{code}] Elapsed Time:{elapsed_time:.4f} "
                "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                    code=self.stock_code, elapsed_time=elapsed_time,
                    max_pv=max_portfolio_value, cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
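
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). A minimal
# example of how this learner could be driven, assuming a concrete subclass
# (hypothetically named DDPGLearner here) that implements get_batch() and
# train(), plus chart/training DataFrames prepared elsewhere in the project.
def _example_run_learner(chart_data, training_data, output_path='output'):
    learner = DDPGLearner(  # hypothetical subclass of ReinforcementLearner
        rl_method='ddpg', stock_code='005930',  # illustrative stock code
        chart_data=chart_data, training_data=training_data,
        min_trading_unit=1, max_trading_unit=2,
        delayed_reward_threshold=0.05,
        net='lstm', num_steps=5, lr=0.001,
        output_path=output_path, reuse_models=False)
    learner.init_value_network()   # builds the critic when rl_method == 'ddpg'
    learner.init_policy_network()  # builds the actor when rl_method == 'ddpg'
    learner.run(num_epoches=100, balance=10000000,
                discount_factor=0.9, start_epsilon=0.5, learning=True)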
class DDPG(object):
    """Implementation of the deep deterministic policy gradient algorithm"""

    def __init__(self, docker_client, name='worker', port=3101,
                 model_path='../models/ddpg', log_path='../logs/ddpg'):
        self.state_size = 29
        self.action_size = 3
        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # target network soft-update rate
        self.lra = 0.0001  # learning rate for the actor
        self.lrc = 0.001  # learning rate for the critic

        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True

        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(self.state_size, self.action_size,
                                  tf.train.AdamOptimizer(self.lra), self.tau)
        self.critic = CriticNetwork(self.state_size, self.action_size,
                                    tf.train.AdamOptimizer(self.lrc), self.tau)
        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()

        self._create_summary()
        self.summary_histogram = tf.summary.merge_all()

    def _create_summary(self):
        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar(
                'loss', self.critic.loss, collections=['loss'])
            self.reward_ph = tf.placeholder(
                shape=[None], name='reward', dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_y_t',
                dtype=tf.float32)
            tf.summary.scalar('reward', tf.reduce_mean(self.reward_ph),
                              collections=['reward'])
            tf.summary.scalar('target_q_values',
                              tf.reduce_mean(self.target_q_values_ph),
                              collections=['reward'])
            tf.summary.scalar('y_t', tf.reduce_mean(self.y_t_ph),
                              collections=['reward'])
            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):
        """Adds noise from an Ornstein-Uhlenbeck process to the actions"""

        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30)
        noise[1] = max(epsilon, 0) * ou_func(a[1], 0.5, 1.00, 0.10)
        noise[2] = max(epsilon, 0) * ou_func(a[2], -0.1, 1.00, 0.10)

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]
        a_new[2] = a[2] + noise[2]

        return a_new

    def train(self, track_name='', check_stuck=True):
        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(self.docker_client, self.name, self.port,
                                 training=True)
        else:
            env = TorcsDockerEnv(self.docker_client, self.name, self.port,
                                 track_name=track_name)

        with tf.Session(config=self.config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.latest_checkpoint(self.model_path)
            if ckpt:
                print('load model weights from {}'.format(ckpt))
                self.saver.restore(sess, ckpt)

            for i in range(self.episode_count):
                # collect the recent rewards
                recent_rewards = np.ones(1000) * 1e9

                print("Episode : " + str(i) + " Replay Buffer "
                      + str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)
                total_reward = 0

                for j in range(self.max_steps):
                    loss = 0
                    # reduce the effect of the OU process as the algorithm
                    # progresses
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))
                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    # check if we need to terminate because the agent is stuck
                    recent_rewards[j % 1000] = reward_t
                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i / self.episode_count < 0.5):
                        break

                    self.buff.add(state_t, action_t[0], reward_t, state_t1,
                                  done)

                    batch = self.buff.getBatch(self.batch_size)

                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    # placeholder with the same shape as the actions,
                    # overwritten with the TD targets below
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (rewards[k]
                                      + self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)

                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(sess, states,
                                                  actions_for_grad)
                    self.actor.train(sess, states, grads)
                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1
                    # write summaries every 50 steps
                    if j % 50 == 0:
                        loss_summary, reward_summary, histogram = sess.run(
                            [self.loss_summary_op, self.reward_summary_op,
                             self.summary_histogram],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.actor.state: states,
                                self.actor.target_state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t
                            })
                        self.summary_writer.add_summary(loss_summary,
                                                        all_steps)
                        self.summary_writer.add_summary(reward_summary,
                                                        all_steps)
                        self.summary_writer.add_summary(histogram, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1

                    print("Episode", i, "Step", all_steps, "Action", action_t,
                          "Reward", reward_t, "Loss", loss)

                    if done:
                        break

                print("TOTAL REWARD @ " + str(i)
                      + "-th Episode : Reward " + str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess, self.model_path + '/model-{:d}.cptk'.format(i))

        env.end()
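
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes a
# local Docker daemon reachable through docker-py and a TORCS image that
# TorcsDockerEnv can launch; the worker name, port, and paths are illustrative
# defaults, not values required by the class.
def _example_run_ddpg():
    import docker

    docker_client = docker.from_env()
    agent = DDPG(docker_client, name='worker-0', port=3101,
                 model_path='../models/ddpg', log_path='../logs/ddpg')
    # Train on randomly launched training tracks; pass track_name=... to pin
    # a specific track instead.
    agent.train(check_stuck=True)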