Example #1
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r, terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the target values for the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not the episode, so an episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)

                    break
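
The TODO above still needs the Ornstein-Uhlenbeck exploration noise referenced by the commented-out actor_noise lines. Below is a minimal sketch of such a noise process; the class name and constructor mirror the commented-out call, but the sigma/theta/dt defaults are common choices, not values taken from this example.

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise for continuous actions (sketch)."""

    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

With a class like this in place, the commented-out actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM)) line and the + actor_noise() term in the action computation can be re-enabled.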
Example #2
class DDPG(object):
    """Implementation of the deep deterministic policy gradient algorithm"""
    def __init__(self,
                 docker_client,
                 name='worker',
                 port=3101,
                 model_path='../models/ddpg',
                 log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 3

        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # target network update hyperparameter
        self.lra = 0.0001  # learning rate for the actor
        self.lrc = 0.001  # learning rate for the critic
        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(self.state_size, self.action_size,
                                  tf.train.AdamOptimizer(self.lra), self.tau)

        self.critic = CriticNetwork(self.state_size, self.action_size,
                                    tf.train.AdamOptimizer(self.lrc), self.tau)

        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()
        self._create_summary()
        self.summary_histogram = tf.summary.merge_all()

    def _create_summary(self):

        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar('loss',
                                                     self.critic.loss,
                                                     collections=['loss'])

            self.reward_ph = tf.placeholder(shape=[None],
                                            name='reward',
                                            dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size],
                name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(shape=[None, self.action_size],
                                         name='target_y_t',
                                         dtype=tf.float32)

            tf.summary.scalar('reward',
                              tf.reduce_mean(self.reward_ph),
                              collections=['reward'])
            tf.summary.scalar('target_q_values',
                              tf.reduce_mean(self.target_q_values_ph),
                              collections=['reward'])
            tf.summary.scalar('y_t',
                              tf.reduce_mean(self.y_t_ph),
                              collections=['reward'])

            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):
        """Adds noise from an Ornstein Uhlenbeck process to the actions"""
        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = (max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30))
        noise[1] = (max(epsilon, 0) * ou_func(a[1], 0.5, 1.00, 0.10))
        noise[2] = (max(epsilon, 0) * ou_func(a[2], -0.1, 1.00, 0.10))

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]
        a_new[2] = a[2] + noise[2]

        return a_new

    def train(self, track_name='', check_stuck=True):

        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(self.docker_client,
                                 self.name,
                                 self.port,
                                 training=True)
        else:
            env = TorcsDockerEnv(self.docker_client,
                                 self.name,
                                 self.port,
                                 track_name=track_name)

        with tf.Session(config=self.config) as sess:
            sess.run(tf.global_variables_initializer())
            ckpt = tf.train.latest_checkpoint(self.model_path)
            if ckpt:
                print('load model weights from {}'.format(ckpt))
                self.saver.restore(sess, ckpt)

            for i in range(self.episode_count):

                # collect the recent rewards
                recent_rewards = np.ones(1000) * 1e9
                print("Episode : " + str(i) + " Replay Buffer " +
                      str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)
                total_reward = 0

                for j in range(self.max_steps):
                    loss = 0

                    # reduce the effect of the OU process as the algorithm progresses
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))

                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    # check if we need to terminate because the agent is stuck
                    recent_rewards[j % 1000] = reward_t
                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i / self.episode_count < 0.5):
                        break

                    self.buff.add(state_t, action_t[0], reward_t, state_t1,
                                  done)
                    batch = self.buff.getBatch(self.batch_size)
                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    # y_t is a placeholder; it is overwritten with the Bellman targets below
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (rewards[k] +
                                      self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)
                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(sess, states,
                                                  actions_for_grad)
                    self.actor.train(sess, states, grads)
                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1

                    if j % 50 == 0:

                        loss_summary, reward_summary, histogram = sess.run(
                            [
                                self.loss_summary_op, self.reward_summary_op,
                                self.summary_histogram
                            ],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.actor.state: states,
                                self.actor.target_state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t
                            })

                        self.summary_writer.add_summary(
                            loss_summary, all_steps)
                        self.summary_writer.add_summary(
                            reward_summary, all_steps)
                        self.summary_writer.add_summary(histogram, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1
                    print("Episode", i, "Step", all_steps, "Action", action_t,
                          "Reward", reward_t, "Loss", loss)
                    if done:
                        break

                print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                      str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess, self.model_path + '/model-{:d}.cptk'.format(i))
        env.end()
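
For reference, a rough driver for this class might look like the sketch below. The docker client setup is an assumption inferred from the constructor arguments; the TORCS docker image and any networking details are outside this example.

import docker

# Hypothetical driver script; names and paths are illustrative only.
docker_client = docker.from_env()
agent = DDPG(docker_client, name='worker_0', port=3101,
             model_path='../models/ddpg', log_path='../logs/ddpg')
agent.train(track_name='', check_stuck=True)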
Example #3
            batch = replay.sample_batch(BATCH_SIZE)
            batch_state = np.reshape(batch[0], (BATCH_SIZE, s_dim))
            batch_action = np.reshape(batch[1], (BATCH_SIZE, a_dim))
            batch_reward = np.reshape(batch[2], (BATCH_SIZE, 1))
            batch_state_prime = np.reshape(batch[3], (BATCH_SIZE, s_dim))
            batch_terminal = np.reshape(batch[4], (BATCH_SIZE, 1))
            idx = batch[5]

            # set y = r + γQ′
            q_prime = critic.q_target(
                batch_state_prime, actor.act_target(batch_state_prime)
            )
            y = batch_reward + GAMMA*(q_prime * (1 - batch_terminal))

            # update critic by minimizing the loss: l = (y − Q)^2
            loss, _ = critic.train(batch_state, batch_action, y)

            # update replay memory losses
            if PRIORITIZED:
                replay.update(loss, idx)

            # update the actor policy using the sampled policy gradient
            # ∇θμ ≈ ∇aQ(s, a|θQ) * ∇θμ μ(s|θμ)
            actions = actor.act(batch_state)
            gradients = critic.policy_gradients(batch_state, actions)
            actor.train(batch_state, gradients[0])

            # update the target networks
            # θμ′ ← τθμ + (1−τ)θμ′
            actor.update_target_network()
            # θQ′ ← τθQ + (1−τ)θQ′
            critic.update_target_network()
Example #4
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self,
                 rl_method='rl',
                 stock_code=None,
                 chart_data=None,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dnn',
                 num_steps=1,
                 lr=0.001,
                 value_network=None,
                 policy_network=None,
                 output_path='',
                 reuse_models=True):
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment setup
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent setup
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Neural network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        self.critic = value_network
        self.actor = policy_network
        self.tau = 0.001
        # Visualization module
        self.visualizer = Visualizer()
        # Memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Epoch-related bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs, etc.
        self.output_path = output_path
        # for Delayed Policy Update
        self._update_step = 0
        self._target_update_interval = 2

    def init_policy_network(self,
                            shared_network=None,
                            activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.rl_method == 'td3':
            print("actor")
            self.actor = ActorNetwork(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      num_steps=self.num_steps,
                                      activation=activation,
                                      loss=loss,
                                      lr=self.lr)
            print(self.actor)
        elif self.net == 'dnn':
            self.policy_network = DNN(input_dim=self.num_features,
                                      output_dim=self.agent.NUM_ACTIONS,
                                      lr=self.lr,
                                      shared_network=shared_network,
                                      activation=activation,
                                      loss=loss)
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation,
                loss=loss)
        if self.reuse_models and \
                os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        if self.rl_method == 'td3':
            self.critic = CriticNetwork(input_dim=self.num_features,
                                        output_dim=self.agent.NUM_ACTIONS,
                                        num_steps=self.num_steps,
                                        activation=activation,
                                        loss=loss,
                                        lr=self.lr)
        elif self.net == 'dnn':
            self.value_network = DNN(input_dim=self.num_features,
                                     output_dim=self.agent.NUM_ACTIONS,
                                     lr=self.lr,
                                     shared_network=shared_network,
                                     activation=activation,
                                     loss=loss)
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(input_dim=self.num_features,
                                             output_dim=self.agent.NUM_ACTIONS,
                                             lr=self.lr,
                                             num_steps=self.num_steps,
                                             shared_network=shared_network,
                                             activation=activation,
                                             loss=loss)
        if self.reuse_models and \
                os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset the memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_target_policy = []
        self.memory_target_value = []
        self.memory_target_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_value2 = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset epoch-related bookkeeping
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    @abc.abstractmethod
    def train(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build the training batch
        x, policy, y_value1, y_value2, critic_target = self.get_batch(
            batch_size, delayed_reward, discount_factor)
        if len(x) > 0:
            loss = 0
            loss += self.critic.train(x, y_value1, y_value2, critic_target)
            if self._update_step % self._target_update_interval == 0:
                # update actor
                loss += self.actor.train(x, policy)

                # update target networks
                self.actor.target_update()
                self.critic.target_update()
            self._update_step += 1  # don't forget to reset this

            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # Build the training batch and update the networks
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
                             * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
                                 + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] \
                                          * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                                + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] \
                                           * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                                 + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
                         * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str,
            num_epoches=num_epoches,
            epsilon=epsilon,
            action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(
            os.path.join(self.epoch_summary_dir,
                         'epoch_summary_{}.png'.format(epoch_str)))

    def run(self,
            num_epoches=100,
            balance=10000000,
            discount_factor=0.9,
            start_epsilon=0.5,
            learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
               "DF:{discount_factor} TU:[{min_trading_unit}," \
               "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so visualize it in advance
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder for saving visualization results
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue used to build num_steps-length samples
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer, and memories
            self.reset()
            # Decrease the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                          * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon
            while True:
                # Build a sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Keep num_steps samples
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                pred_value = None
                pred_value2 = None
                pred_policy = None
                pred_target_policy = None
                pred_target_value = None
                if self.critic is not None:
                    pred_value = self.critic.predict(list(q_sample))
                    pred_value2 = self.critic.predict2(list(q_sample))
                if self.actor is not None:
                    pred_policy = self.actor.predict(list(q_sample))
                    pred_target_policy = self.actor.target_model1_predict(
                        list(q_sample))

                # Decide an action from the networks or by exploration
                action, confidence, exploration = \
                    self.agent.decide_action(pred_value, pred_policy, epsilon)

                # Decide an action using the target values
                target_action, target_confidence, target_exploration = \
                    self.agent.decide_action(pred_target_policy, pred_target_value, epsilon)

                # Perform the decided action and obtain the immediate and delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its results
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                self.memory_target_action.append(target_action)
                self.memory_target_policy.append(pred_target_policy)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                    self.memory_value2.append(pred_value2)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration bookkeeping
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training when a delayed reward occurs
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)
            # Train once more after the epoch ends
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)
            # Log epoch-related information
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                             "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                             "#Stocks:{} PV:{:,.0f} "
                             "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                                 self.stock_code, epoch_str, num_epoches,
                                 epsilon, self.exploration_cnt, self.itr_cnt,
                                 self.agent.num_buy, self.agent.num_sell,
                                 self.agent.num_hold, self.agent.num_stocks,
                                 self.agent.portfolio_value, self.learning_cnt,
                                 self.loss, elapsed_time_epoch))

            # Visualize epoch-related information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training-related statistics
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log a training summary
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code,
                             elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
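
Since get_batch and train are abstract, ReinforcementLearner must be subclassed before it can run. The skeleton below is a hypothetical stub, included only to illustrate the interface that update_networks expects (a 5-tuple of x, policy, y_value1, y_value2, critic_target); the original project defines its own concrete learners.

class SkeletonLearner(ReinforcementLearner):
    # Illustrative stub only; a real subclass builds these arrays from the
    # memory_* lists populated during run().
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        return [], [], [], [], []

    def train(self, batch_size, delayed_reward, discount_factor):
        pass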
Example #5
class Agent(object):
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 action_bound,
                 tau,
                 env,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.action_bound = action_bound
        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name='Actor')
        self.critic = CriticNetwork(beta,
                                    input_dims,
                                    layer1_size,
                                    layer2_size,
                                    n_actions=n_actions,
                                    name='Critic')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         n_actions=n_actions,
                                         name='TargetActor')
        self.target_critic = CriticNetwork(beta,
                                           input_dims,
                                           layer1_size,
                                           layer2_size,
                                           n_actions=n_actions,
                                           name='TargetCritic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = T.tensor(observation,
                               dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(
            self.actor.device)
        self.actor.train()
        return (mu_prime * T.tensor(self.action_bound)).cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic.device)
        done = T.tensor(done).to(self.critic.device)
        new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device)
        action = T.tensor(action, dtype=T.float).to(self.critic.device)
        state = T.tensor(state, dtype=T.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = T.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                      (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                      (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)
        """
        #Verify that the copy assignment worked correctly
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(target_critic_params)
        actor_state_dict = dict(target_actor_params)
        print('\nActor Networks', tau)
        for name, param in self.actor.named_parameters():
            print(name, T.equal(param, actor_state_dict[name]))
        print('\nCritic Networks', tau)
        for name, param in self.critic.named_parameters():
            print(name, T.equal(param, critic_state_dict[name]))
        input()
        """

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def check_actor_params(self):
        current_actor_params = self.actor.named_parameters()
        current_actor_dict = dict(current_actor_params)
        original_actor_dict = dict(self.original_actor.named_parameters())
        original_critic_dict = dict(self.original_critic.named_parameters())
        current_critic_params = self.critic.named_parameters()
        current_critic_dict = dict(current_critic_params)
        print('Checking Actor parameters')

        for param in current_actor_dict:
            print(
                param,
                T.equal(original_actor_dict[param], current_actor_dict[param]))
        print('Checking critic parameters')
        for param in current_critic_dict:
            print(
                param,
                T.equal(original_critic_dict[param],
                        current_critic_dict[param]))
        input()
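
A typical training loop for this agent might look like the following sketch. The environment, hyperparameter values, and input_dims are illustrative assumptions (using the classic Gym API), not settings from the example above.

import gym

# Hypothetical driver; hyperparameters and the environment are assumptions.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], action_bound=1.0,
              tau=0.001, env=env, n_actions=2)

for episode in range(1000):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(obs)
        new_obs, reward, done, info = env.step(action)
        agent.remember(obs, action, reward, new_obs, int(done))
        agent.learn()
        score += reward
        obs = new_obs
    print('episode', episode, 'score %.2f' % score)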
Example #6
                # Compute the target Q values.
                target_action = actor.predict_target(s2_batch)
                target_q = critic.predict_target(s2_batch, target_action)

                # Compute the target values for the critic.
                targets = []
                for i in range(MINIBATCH_SIZE):
                    if t_batch[i]:
                        # terminal
                        targets.append(r_batch[i])
                    else:
                        targets.append(r_batch[i] + GAMMA * target_q[i])

                # Train the critic.
                # TODO: pred_q comes from a random batch, not the episode, so an episode_avg_max statistic is not appropriate.
                pred_q, _ = critic.train(
                    s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))

                # Train the actor.
                actor.train(s_batch, a_batch)

                # Update target networks.
                # Should this be done only once every few batches?
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r
            episode_avg_max_Q += np.amax(pred_q)

            if terminal:
                print('Episode:', ep, 'Reward:', episode_reward)
Example #7
class DdpgAgent:
    """
    A Deep Deterministic Policy Gradient Agent.
    Interacts with and learns from the environment.
    """
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """
        Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents observed at the same time. multiple agents are handled within the class.
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """

        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        self.t_step = 0  # A counter that increases each time the "step" function is executed
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = ActorNetwork(state_size,
                                        action_size,
                                        USE_BATCH_NORM,
                                        random_seed,
                                        fc1_units=FC1_UNITS,
                                        fc2_units=FC2_UNITS,
                                        fc3_units=FC3_UNITS).to(device)
        self.actor_target = ActorNetwork(state_size,
                                         action_size,
                                         USE_BATCH_NORM,
                                         random_seed,
                                         fc1_units=FC1_UNITS,
                                         fc2_units=FC2_UNITS,
                                         fc3_units=FC3_UNITS).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_ACTOR)
        # self.actor_optimizer = optim.RMSprop(self.actor_local.parameters(), lr=LR_ACTOR,
        #                                      weight_decay=WEIGHT_DECAY_ACTOR)  # Also solves it, but Adam quicker

        # Critic Network (w/ Target Network)
        self.critic_local = CriticNetwork(state_size,
                                          action_size,
                                          USE_BATCH_NORM,
                                          random_seed,
                                          fc1_units=FC1_UNITS,
                                          fc2_units=FC2_UNITS,
                                          fc3_units=FC3_UNITS).to(device)
        self.critic_target = CriticNetwork(state_size,
                                           action_size,
                                           USE_BATCH_NORM,
                                           random_seed,
                                           fc1_units=FC1_UNITS,
                                           fc2_units=FC2_UNITS,
                                           fc3_units=FC3_UNITS).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CRITIC)
        # self.critic_optimizer = optim.RMSprop(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                       weight_decay=WEIGHT_DECAY_CRITIC)  # Also solves it, but Adam quicker

        # Make sure the targets are initialized with the same weights as the local networks
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        # Setting default modes for the networks
        # Target networks do not need to train, so always eval()
        # Local networks stay in training mode unless altered in code, e.g. when acting.
        self.actor_local.train()
        self.actor_target.eval()
        self.critic_local.train()
        self.critic_target.eval()

        # Action Noise process (encouraging exploration during training)
        # Could consider parameter noise in future as a potentially better alternative / addition
        if ACTION_NOISE_METHOD == 'initial':
            self.noise = InitialOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                theta=NOISE_THETA,
                sigma=NOISE_SIGMA)
        elif ACTION_NOISE_METHOD == 'adjusted':
            self.noise = AdjustedOrnsteinUhlenbeckActionNoise(
                shape=(num_agents, action_size),
                random_seed=random_seed,
                x0=0,
                mu=0,
                sigma=NOISE_SIGMA,
                theta=NOISE_THETA,
                dt=NOISE_DT,
                sigma_delta=NOISE_SIGMA_DELTA,
            )
        else:
            raise ValueError('Unknown action noise method: ' +
                             ACTION_NOISE_METHOD)

        # Replay memory
        self.memory = ReplayBuffer(
            buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            sampling_method=REPLAY_BUFFER_SAMPLING_METHOD,
            random_seed=random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1

        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory, every UPDATE_EVERY steps
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, add_action_noise=False):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        # train mode is set right before actual training
        self.actor_local.eval()
        # All calcs here with no_grad, but many examples didn't do this. Weirdly, this is slower..
        with torch.no_grad():
            return np.clip(
                self.actor_local(states).cpu().data.numpy() +
                (self.noise.sample() if add_action_noise else 0), -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): reward discount factor
        """

        states, actions, rewards, next_states, dones = experiences
        # critic_local is always in train mode, but actor_local goes into eval() while acting
        self.actor_local.train()

        # Critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_GRADIENT_CRITIC:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if CLIP_GRADIENT_ACTOR:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # Soft-Update of Target Networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update target model parameters from local model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
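
This agent relies on a number of module-level constants (REPLAY_BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, learning rates, noise settings, and so on) that are not shown in the example. A hypothetical configuration block is sketched below; the names match what the class references, but the concrete values are common DDPG defaults, not the original author's settings.

import torch

# Hypothetical configuration; values are placeholders, not the original settings.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

REPLAY_BUFFER_SIZE = int(1e6)
REPLAY_BUFFER_SAMPLING_METHOD = 'uniform'
BATCH_SIZE = 128
UPDATE_EVERY = 1                  # learn after every environment step
GAMMA = 0.99                      # reward discount factor
TAU = 1e-3                        # soft-update interpolation factor
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
WEIGHT_DECAY_ACTOR = 0.0
WEIGHT_DECAY_CRITIC = 0.0
CLIP_GRADIENT_ACTOR = False
CLIP_GRADIENT_CRITIC = True
USE_BATCH_NORM = True
FC1_UNITS, FC2_UNITS, FC3_UNITS = 256, 128, 64
ACTION_NOISE_METHOD = 'initial'   # or 'adjusted'
NOISE_THETA = 0.15
NOISE_SIGMA = 0.2
NOISE_DT = 1e-2
NOISE_SIGMA_DELTA = 0.0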
Example #8
class Learner(tf.Module):
    def __init__(self, logger, replay_buffer):
        super(Learner, self).__init__(name="Learner")
        self.device = Params.DEVICE

        with tf.device(self.device), self.name_scope:
            self.dtype = Params.DTYPE
            self.logger = logger
            self.batch_size = Params.MINIBATCH_SIZE
            self.gamma = Params.GAMMA
            self.tau = Params.TAU
            self.replay_buffer = replay_buffer
            self.priority_beta = tf.Variable(Params.BUFFER_PRIORITY_BETA_START)
            self.running = tf.Variable(True)
            self.n_steps = tf.Variable(0)

            # Init Networks
            self.actor = ActorNetwork(with_target_net=True)
            self.critic = CriticNetwork()

            # Save shared variables
            self.policy_variables = self.actor.tvariables + self.actor.nvariables

    def save_model(self, path):
        self.actor.actor_network.save(path,
                                      overwrite=True,
                                      include_optimizer=False,
                                      save_format="tf")

    @tf.function()
    def run(self):
        print("retracing learner run")

        with tf.device(self.device), self.name_scope:

            # Wait for replay buffer to fill
            tf.cond(
                Params.BUFFER_FROM_REVERB,
                lambda: [],
                lambda: tf.while_loop(
                    lambda: tf.logical_and(
                        tf.less(self.replay_buffer.size(), Params.
                                MINIBATCH_SIZE), self.running),
                    lambda: [],
                    loop_vars=[],
                    parallel_iterations=1,
                ),
            )

            # Do training
            n_steps = tf.while_loop(lambda n_step: tf.logical_and(
                tf.less_equal(n_step, Params.MAX_STEPS_TRAIN), self.running),
                                    self.train_step,
                                    loop_vars=[tf.constant(1)])

            # Save number of performed steps
            self.n_steps.assign(tf.squeeze(n_steps))

    def train_step(self, n_step):
        print("retracing train_step")

        with tf.device(self.device), self.name_scope:

            print("Eager Execution:  ", tf.executing_eagerly())
            print("Eager Keras Model:", self.actor.actor_network.run_eagerly)

            if Params.BUFFER_FROM_REVERB:

                # Get batch from replay memory
                (s_batch, a_batch, _, _, s2_batch, target_z_atoms_batch), weights_batch, idxes_batch = \
                    self.replay_buffer.sample_batch(self.batch_size, self.priority_beta)

            else:

                # Get batch from replay memory
                (s_batch, a_batch, r_batch, t_batch, s2_batch, g_batch), weights_batch, idxes_batch = \
                    self.replay_buffer.sample_batch(self.batch_size, self.priority_beta)

                # Compute targets (bellman update)
                target_z_atoms_batch = tf.where(t_batch, 0., Params.Z_ATOMS)
                target_z_atoms_batch = r_batch + (target_z_atoms_batch *
                                                  g_batch)

            # Predict target Q value by target critic network
            target_action_batch = self.actor.target_actor_network(
                s2_batch, training=False)
            target_q_probs = tf.cast(
                self.critic.target_critic_network(
                    [s2_batch, target_action_batch], training=False),
                self.dtype)

            # Train the critic on given targets
            td_error_batch = self.critic.train(
                x=[s_batch, a_batch],
                target_z_atoms=target_z_atoms_batch,
                target_q_probs=target_q_probs,
                is_weights=weights_batch)

            # Compute actions for state batch
            actions = self.actor.actor_network(s_batch, training=False)

            # Compute action values; their gradients are negated below to enable gradient ascent
            values = self.critic.critic_network([s_batch, actions],
                                                training=False)

            # Compute (dq / da * da / dtheta = dq / dtheta) grads (action values grads wrt. actor network weights)
            action_gradients = tf.gradients(values, actions, Params.Z_ATOMS)[0]
            actor_gradients = tf.gradients(actions, self.actor.tvariables,
                                           -action_gradients)

            # Normalize grads element-wise
            actor_gradients = [
                tf.divide(gradient, tf.cast(self.batch_size, self.dtype))
                for gradient in actor_gradients
            ]

            # Apply gradients to actor net
            self.actor.actor_network.optimizer.apply_gradients(
                zip(actor_gradients, self.actor.tvariables))

            # Update target networks
            update_weights(
                self.actor.target_tvariables + self.actor.target_nvariables,
                self.actor.tvariables + self.actor.nvariables, self.tau)
            update_weights(
                self.critic.target_tvariables + self.critic.target_nvariables,
                self.critic.tvariables + self.critic.nvariables, self.tau)

            # Use critic TD error to update priorities
            self.replay_buffer.update_priorities(idxes_batch, td_error_batch)

            # Increment beta value
            self.priority_beta.assign_add(
                Params.BUFFER_PRIORITY_BETA_INCREMENT)

            # Log status
            tf.cond(
                tf.equal(
                    tf.math.mod(n_step, tf.constant(Params.LEARNER_LOG_STEPS)),
                    tf.constant(0)), lambda: self.logger.log_step_learner(
                        n_step,
                        tf.cast(tf.reduce_mean(td_error_batch), Params.DTYPE),
                        self.priority_beta), lambda: None)

            return tf.add(n_step, 1)
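
The learner above calls an update_weights helper for the soft target updates that is not shown. A minimal sketch consistent with its call sites (target variables, source variables, tau) is given below; the project's actual helper may differ.

import tensorflow as tf


def update_weights(target_variables, source_variables, tau):
    """Soft update: target <- tau * source + (1 - tau) * target (sketch)."""
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign(tau * source_var + (1.0 - tau) * target_var)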