def init_policy_network(self, shared_network=None,
                        activation='sigmoid', loss='binary_crossentropy'):
    if self.rl_method == 'td3':
        print("actor")
        self.actor = ActorNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            num_steps=self.num_steps, activation=activation,
            loss=loss, lr=self.lr)
        print(self.actor)
    elif self.net == 'dnn':
        self.policy_network = DNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.policy_network = LSTMNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    # Guard against the TD3 case, where self.policy_network stays None
    if self.reuse_models and self.policy_network is not None and \
            os.path.exists(self.policy_network_path):
        self.policy_network.load_model(
            model_path=self.policy_network_path)
def init_value_network(self, shared_network=None,
                       activation='linear', loss='mse'):
    if self.rl_method == 'td3':
        self.critic = CriticNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS,
            num_steps=self.num_steps, activation=activation,
            loss=loss, lr=self.lr)
    elif self.net == 'dnn':
        self.value_network = DNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    # Guard against the TD3 case, where self.value_network stays None
    if self.reuse_models and self.value_network is not None and \
            os.path.exists(self.value_network_path):
        self.value_network.load_model(
            model_path=self.value_network_path)
def init_value_network(self, shared_network=None,
                       activation='linear', loss='mse'):
    if self.net == 'dnn':
        self.value_network = DNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.value_network = LSTMNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'cnn':
        self.value_network = CNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    # If reuse_models is True and a file exists at value_network_path,
    # load the saved network model.
    if self.reuse_models and \
            os.path.exists(self.value_network_path):
        self.value_network.load_model(
            model_path=self.value_network_path)
def init_policy_network(self, shared_network=None,
                        activation='softmax', loss='categorical_crossentropy'):
    if self.net == 'dnn':
        self.policy_network = DNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'lstm':
        self.policy_network = LSTMNetwork(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    elif self.net == 'cnn':
        self.policy_network = CNN(
            input_dim=self.num_features,
            output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
            num_steps=self.num_steps,
            shared_network=shared_network,
            activation=activation, loss=loss)
    if self.reuse_models and \
            os.path.exists(self.policy_network_path):
        self.policy_network.load_model(
            model_path=self.policy_network_path)
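The TD3 branches above construct ActorNetwork and CriticNetwork objects whose definitions are not part of this listing. The stub below is only a sketch of the interface that ReinforcementLearner appears to rely on: the method names and argument counts are taken from the calls made later in the class, the constructor keywords mirror what init_policy_network/init_value_network pass in, and everything about the internals (Keras models, twin critics, target copies, soft updates with tau) is an assumption rather than the project's actual implementation.

# Interface sketch only -- inferred from the calls made in ReinforcementLearner.
class ActorNetwork:
    def __init__(self, input_dim=0, output_dim=0, num_steps=1,
                 activation='sigmoid', loss='binary_crossentropy', lr=0.001):
        ...

    def predict(self, sample):
        """Prediction of the online actor for a num_steps-length sample."""

    def target_model1_predict(self, sample):
        """Prediction of the target actor (used to pick target actions)."""

    def train(self, x, policy):
        """One gradient step on the actor; returns a scalar loss."""

    def target_update(self):
        """Soft-update the target actor toward the online actor."""


class CriticNetwork:
    def __init__(self, input_dim=0, output_dim=0, num_steps=1,
                 activation='linear', loss='mse', lr=0.001):
        ...

    def predict(self, sample):
        """Q-values from the first online critic."""

    def predict2(self, sample):
        """Q-values from the second (twin) online critic."""

    def train(self, x, y_value1, y_value2, critic_target):
        """One gradient step on both critics; returns a scalar loss."""

    def target_update(self):
        """Soft-update the target critics toward the online critics."""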
class ReinforcementLearner(metaclass=abc.ABCMeta):
    lock = threading.Lock()

    def __init__(self, rl_method='rl', stock_code=None,
                 chart_data=None, training_data=None,
                 min_trading_unit=1, max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dnn', num_steps=1, lr=0.001,
                 value_network=None, policy_network=None,
                 output_path='', reuse_models=True):
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Neural network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module
        self.visualizer = Visualizer()
        # Memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Per-epoch information
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs, etc.
        self.output_path = output_path
        # For delayed policy updates
        self._update_step = 0
        self._target_update_interval = 2

    def init_policy_network(self, shared_network=None,
                            activation='sigmoid', loss='binary_crossentropy'):
        if self.rl_method == 'td3':
            print("actor")
            self.actor = ActorNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation,
                loss=loss, lr=self.lr)
            print(self.actor)
        elif self.net == 'dnn':
            self.policy_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        # Guard against the TD3 case, where self.policy_network stays None
        if self.reuse_models and self.policy_network is not None and \
                os.path.exists(self.policy_network_path):
            self.policy_network.load_model(
                model_path=self.policy_network_path)

    def init_value_network(self, shared_network=None,
                           activation='linear', loss='mse'):
        if self.rl_method == 'td3':
            self.critic = CriticNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                num_steps=self.num_steps, activation=activation,
                loss=loss, lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                shared_network=shared_network,
                activation=activation, loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS, lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss)
        # Guard against the TD3 case, where self.value_network stays None
        if self.reuse_models and self.value_network is not None and \
                os.path.exists(self.value_network_path):
            self.value_network.load_model(
                model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset environment
        self.environment.reset()
        # Reset agent
        self.agent.reset()
        # Reset visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset per-epoch information
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build the training batch
        x, policy, y_value1, y_value2, critic_target = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            # Train both critics toward the shared critic target
            loss += self.critic.train(x, y_value1, y_value2, critic_target)
            if self._update_step % self._target_update_interval == 0:
                # Delayed policy update: update the actor
                loss += self.actor.train(x, policy)
                # Update the target networks
                self.actor.target_update()
                self.critic.target_update()
            self._update_step += 1  # don't forget to reset this counter
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full else self.batch_size
        # Build the batch and update the networks
        if batch_size > 0:
            _loss = self.update_networks(
                batch_size, delayed_reward, discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))] \
                * (self.num_steps - 1) + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str, num_epoches=num_epoches,
            epsilon=epsilon, action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(os.path.join(
            self.epoch_summary_dir,
            'epoch_summary_{}.png'.format(epoch_str)))

    def run(self, num_epoches=100, balance=10000000,
            discount_factor=0.9, start_epsilon=0.5, learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
               "DF:{discount_factor} TU:[{min_trading_unit}," \
               "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so visualize it up front
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder where visualization results are saved
        self.epoch_summary_dir = os.path.join(
            self.output_path,
            'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop over epochs
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue for building num_steps-length samples
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset environment, agent, networks, visualizer, and memory
            self.reset()

            # Decay the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon

            while True:
                # Build a sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Keep the last num_steps samples
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                pred_value = None
                pred_value2 = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_value2 = self.critic.predict2(list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_model1_predict(
                        list(q_sample))

                # Decide an action from the networks or by exploration
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide an action using the target network predictions
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(
                        pred_target_policy, pred_target_value, epsilon)

                # Perform the chosen action and obtain the
                # immediate and delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its results
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                    self.memory_value2.append(pred_value2)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration counters
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training when a delayed reward occurs
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Training at the end of the epoch
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log per-epoch information
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                    self.stock_code, epoch_str, num_epoches, epsilon,
                    self.exploration_cnt, self.itr_cnt,
                    self.agent.num_buy, self.agent.num_sell,
                    self.agent.num_hold, self.agent.num_stocks,
                    self.agent.portfolio_value,
                    self.learning_cnt, self.loss, elapsed_time_epoch))

            # Visualize per-epoch information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training statistics
            max_portfolio_value = max(
                max_portfolio_value, self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log the training summary
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                code=self.stock_code, elapsed_time=elapsed_time,
                max_pv=max_portfolio_value, cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
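For reference, a minimal usage sketch. ReinforcementLearner is abstract (get_batch and train are left to subclasses), so the concrete class name TD3Learner, the prepared chart_data/training_data DataFrames, and the value_network_path/policy_network_path attributes assumed below are illustrative placeholders rather than part of the listing above.

# Hypothetical usage sketch. Assumes a concrete subclass (here called
# TD3Learner) that implements get_batch()/train() and defines
# value_network_path / policy_network_path before the init_*_network calls.
import os

output_path = os.path.join('output', 'td3_example')
learner = TD3Learner(
    rl_method='td3', stock_code='005930',   # example stock code
    chart_data=chart_data,                  # prepared chart DataFrame
    training_data=training_data,            # prepared feature DataFrame
    min_trading_unit=1, max_trading_unit=2,
    delayed_reward_threshold=0.05,
    net='lstm', num_steps=5, lr=0.001,
    output_path=output_path, reuse_models=False)

learner.init_value_network()    # builds the critic when rl_method == 'td3'
learner.init_policy_network()   # builds the actor when rl_method == 'td3'
learner.run(num_epoches=100, balance=10000000,
            discount_factor=0.9, start_epsilon=0.5, learning=True)
learner.save_models()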