Ejemplo n.º 1
0
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self,
                 rl_method='rl',
                 stock_code=None,
                 chart_data=None,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dnn',
                 num_steps=1,
                 lr=0.001,
                 value_network=None,
                 policy_network=None,
                 output_path='',
                 reuse_models=True):
        # 인자 확인
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # 강화학습 기법 설정
        self.rl_method = rl_method
        # 환경 설정
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # 에이전트 설정
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # 학습 데이터
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # 벡터 크기 = 학습 데이터 벡터 크기 + 에이전트 상태 크기
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # 신경망 설정
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # 가시화 모듈
        self.visualizer = Visualizer()
        # 메모리
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # 에포크 관련 정보
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # 로그 등 출력 경로
        self.output_path = output_path

    def init_policy_network(self,
                            shared_network=None,
                            activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.rl_method == 'ddpg':
            print("actor")
            self.actor = ActorNetwork(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      num_steps=self.num_steps,
                                      activation=activation,
                                      loss=loss,
                                      lr=self.lr)
            print(self.actor)
        elif self.net == 'dnn':
            self.policy_network = DNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation,
                loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      num_steps=self.num_steps,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        elif self.net == 'cnn':
            self.policy_network = CNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      num_steps=self.num_steps,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        if self.reuse_models and \
                os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        if self.rl_method == 'ddpg':
            self.critic = CriticNetwork(input_dim=self.num_features,
                                        output_dim=self.agent.NUM_ACTIONS,
                                        num_steps=self.num_steps,
                                        activation=activation,
                                        loss=loss,
                                        lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(input_dim=self.num_features,
                                             output_dim=self.agent.NUM_ACTIONS,
                                             lr=self.lr,
                                             num_steps=self.num_steps,
                                             shared_network=shared_network,
                                             activation=activation,
                                             loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     num_steps=self.num_steps,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        elif self.net == 'cnn':
            self.value_network = CNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     num_steps=self.num_steps,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)

        if self.reuse_models and \
                os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # 환경 초기화
        self.environment.reset()
        # 에이전트 초기화
        self.agent.reset()
        # 가시화 초기화
        self.visualizer.clear([0, len(self.chart_data)])
        # 메모리 초기화
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # 에포크 관련 정보 초기화
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # 배치 학습 데이터 생성
        x, y_value, y_policy = self.get_batch(batch_size, delayed_reward,
                                              discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # 가치 신경망 갱신
                loss += self.critic.train_on_batch(x, y_value)
                self.critic.transfer_weights()
            if y_policy is not None:
                # 정책 신경망 갱신
                loss += self.actor.train_on_batch(x, y_policy)
                self.actor.transfer_weights()
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # 배치 학습 데이터 생성 및 신경망 갱신
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
                             * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
                                 + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] \
                                          * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                                + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] \
                                           * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                                 + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
                         * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str,
            num_epoches=num_epoches,
            epsilon=epsilon,
            action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(
            os.path.join(self.epoch_summary_dir,
                         'epoch_summary_{}.png'.format(epoch_str)))

    def run(self,
            num_epoches=100,
            balance=10000000,
            discount_factor=0.9,
            start_epsilon=0.5,
            learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
               "DF:{discount_factor} TU:[{min_trading_unit}," \
               "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)

        # 시작 시간
        time_start = time.time()

        # 가시화 준비
        # 차트 데이터는 변하지 않으므로 미리 가시화
        self.visualizer.prepare(self.environment.chart_data, info)

        # 가시화 결과 저장할 폴더 준비
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # 에이전트 초기 자본금 설정
        self.agent.set_balance(balance)

        # 학습에 대한 정보
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # 학습 반복
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # step 샘플을 만들기 위한 큐
            q_sample = collections.deque(maxlen=self.num_steps)

            # 환경, 에이전트, 신경망, 가시화, 메모리 초기화
            self.reset()
            # 학습을 진행할 수록 탐험 비율 감소
            if learning:
                epsilon = start_epsilon \
                          * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon
            while True:
                # 샘플 생성
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # num_steps만큼 샘플 저장
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # 가치, 정책 신경망 예측
                pred_value = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_target_value = self.critic.target_predict(
                        list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_predict(
                        list(q_sample))

                # 신경망 또는 탐험에 의한 행동 결정
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # target 값을 이용한 행동 결정
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(pred_target_policy, pred_target_value, epsilon)

                #결정한 행동을 수행하고 즉시 보상과 지연 보상 획득
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # 행동 및 행동에 대한 결과를 기억
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                self.memory_target_value.append(pred_target_value)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # 반복에 대한 정보 갱신
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # 지연 보상 발생된 경우 미니 배치 학습
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)
            # 에포크 종료 후 학습
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)
            # 에포크 관련 정보 로그 기록
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                 self.stock_code, epoch_str, num_epoches,
                                 epsilon, self.exploration_cnt, self.itr_cnt,
                                 self.agent.num_buy, self.agent.num_sell,
                                 self.agent.num_hold, self.agent.num_stocks,
                                 self.agent.portfolio_value, self.learning_cnt,
                                 self.loss, elapsed_time_epoch))

            # 에포크 관련 정보 가시화
            self.visualize(epoch_str, num_epoches, epsilon)

            # 학습 관련 정보 갱신
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # 종료 시간
        time_end = time.time()
        elapsed_time = time_end - time_start

        # 학습 관련 정보 로그 기록
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code,
                             elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
Ejemplo n.º 2
0
class DDPG(object):
    """Implementation of the deep deterministic policy gradient algorithm"""
    def __init__(self,
                 docker_client,
                 name='worker',
                 port=3101,
                 model_path='../models/ddpg',
                 log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 3

        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99  # disocunt factor
        self.tau = 0.001  # Target Network HyperParameters
        self.lra = 0.0001  # Learning rate for Actor
        self.lrc = 0.001  # Lerning rate for Critic
        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(self.state_size, self.action_size,
                                  tf.train.AdamOptimizer(self.lra), self.tau)

        self.critic = CriticNetwork(self.state_size, self.action_size,
                                    tf.train.AdamOptimizer(self.lrc), self.tau)

        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()
        self._create_summary()
        self.summary_histogram = tf.summary.merge_all()

    def _create_summary(self):

        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar('loss',
                                                     self.critic.loss,
                                                     collections=['loss'])

            self.reward_ph = tf.placeholder(shape=[
                None,
            ],
                                            name='reward',
                                            dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size],
                name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(shape=[None, self.action_size],
                                         name='target_y_t',
                                         dtype=tf.float32)

            tf.summary.scalar('reward',
                              tf.reduce_mean(self.reward_ph),
                              collections=['reward'])
            tf.summary.scalar('target_q_values',
                              tf.reduce_mean(self.target_q_values_ph),
                              collections=['reward'])
            tf.summary.scalar('y_t',
                              tf.reduce_mean(self.y_t_ph),
                              collections=['reward'])

            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):
        """Adds noise from an Ornstein Uhlenbeck process to the actions"""
        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = (max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30))
        noise[1] = (max(epsilon, 0) * ou_func(a[1], 0.5, 1.00, 0.10))
        noise[2] = (max(epsilon, 0) * ou_func(a[2], -0.1, 1.00, 0.10))

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]
        a_new[2] = a[2] + noise[2]

        return a_new

    def train(self, track_name='', check_stuck=True):

        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(self.docker_client,
                                 self.name,
                                 self.port,
                                 training=True)
        else:
            env = TorcsDockerEnv(self.docker_client,
                                 self.name,
                                 self.port,
                                 track_name=track_name)

        with tf.Session(config=self.config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.latest_checkpoint(self.model_path)
            if ckpt:
                print('load model weights from {}'.format(ckpt))
                self.saver.restore(sess, ckpt)

            for i in range(self.episode_count):

                # collect the recent rewards
                recent_rewards = np.ones(1000) * 1e9
                print("Episode : " + str(i) + " Replay Buffer " +
                      str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)
                total_reward = 0

                for j in range(self.max_steps):
                    loss = 0

                    # reduce the effect of the OU process with progess in the
                    # algorithm
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))

                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    # check if we need to terminate, bc the agent is stuck
                    recent_rewards[j % 1000] = reward_t
                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i / self.episode_count < 0.5):
                        break

                    self.buff.add(state_t, action_t[0], reward_t, state_t1,
                                  done)
                    batch = self.buff.getBatch(self.batch_size)
                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (rewards[k] +
                                      self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)
                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(sess, states,
                                                  actions_for_grad)
                    self.actor.train(sess, states, grads)
                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1

                    if j % 50:

                        loss_summary, reward_summary, histogram = sess.run(
                            [
                                self.loss_summary_op, self.reward_summary_op,
                                self.summary_histogram
                            ],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.actor.state: states,
                                self.actor.target_state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t
                            })

                        self.summary_writer.add_summary(
                            loss_summary, all_steps)
                        self.summary_writer.add_summary(
                            reward_summary, all_steps)
                        self.summary_writer.add_summary(histogram, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1
                    print("Episode", i, "Step", all_steps, "Action", action_t,
                          "Reward", reward_t, "Loss", loss)
                    if done:
                        break

                print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                      str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess, self.model_path + '/model-{:d}.cptk'.format(i))
        env.end()