class DDPGPrioritizedReplay(DDPG):

    def __init__(self,
                 s_dim, a_dim, a_bound,
                 a_lr=0.001, c_lr=0.001,
                 tau=0.001, gamma=0.9,
                 memory_capacity=5000, batch_size=64,
                 train={'train': True, 'save_iter': None, 'load_point': -1},
                 model_dir='./model',
                 ):
        super(DDPGPrioritizedReplay, self).__init__(
            s_dim=s_dim, a_dim=a_dim, a_bound=a_bound,
            a_lr=a_lr, c_lr=c_lr,
            tau=tau, gamma=gamma,
            memory_capacity=memory_capacity, batch_size=batch_size,
            train=train, model_dir=model_dir,)
        self.memory = Memory(capacity=memory_capacity, batch_size=batch_size, s_dim=s_dim, a_dim=a_dim)

    def learn(self, lock=None):
        # soft replacement of the target networks
        self._soft_rep_target()
        self._check_save()

        for _ in range(self.update_times):
            self.learn_counter += 1
            if lock is not None: lock.acquire()
            tree_idx, bt, ISWeights = self.memory.sample()

            bs, ba, br, bs_ = bt['s'], bt['a'], bt['r'], bt['s_']
            Vbs, Vba, Vbr, Vbs_, VISW = \
                Variable(torch.from_numpy(bs).float()), Variable(torch.from_numpy(ba).float()), \
                Variable(torch.from_numpy(br).float()), Variable(torch.from_numpy(bs_).float()),\
                Variable(torch.from_numpy(ISWeights).float())

            target_q = Vbr + self.gamma * self.cnet_(Vbs_, self.anet_(Vbs_)).detach()  # not train
            td_errors = self.cnet(Vbs, Vba) - target_q

            # update priority
            abs_errors = torch.abs(td_errors).data.numpy()
            self.memory.batch_update(tree_idx, abs_errors)
            if lock is not None: lock.release()

            c_loss = torch.mean(VISW * torch.pow(td_errors, 2))
            self.copt.zero_grad()
            c_loss.backward()
            self.copt.step()

            policy_loss = -self.cnet(Vbs, self.anet(Vbs)).mean()
            self.aopt.zero_grad()
            policy_loss.backward()
            self.aopt.step()

    def store_transition(self, s, a, r, s_):
        if a.ndim < 2:
            a = a[:, None]
        if r.ndim < 2:
            r = r[:, None]
        s = np.ascontiguousarray(s)
        s_ = np.ascontiguousarray(s_)
        self.memory.store(s, a, r, s_)
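
The `Memory` class used above is not part of this listing, and its constructor signature varies across the examples below (some pass batch_size/s_dim/a_dim, others only a capacity). A minimal SumTree-backed sketch of proportional prioritized replay with the store/sample/batch_update surface these examples rely on might look like the following; the alpha, beta, and epsilon constants are illustrative assumptions, not values taken from any example.

import numpy as np

class SumTree:
    """Binary tree whose leaves hold priorities and whose parents hold sums of their children."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)  # transitions stored alongside the leaves
        self.data_pointer = 0

    def add(self, priority, data):
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_idx, priority)
        self.data_pointer = (self.data_pointer + 1) % self.capacity  # overwrite oldest entry

    def update(self, tree_idx, priority):
        change = priority - self.tree[tree_idx]
        self.tree[tree_idx] = priority
        while tree_idx != 0:                          # propagate the change up to the root
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get_leaf(self, v):
        parent = 0
        while True:                                   # descend towards the sampled leaf
            left, right = 2 * parent + 1, 2 * parent + 2
            if left >= len(self.tree):
                leaf = parent
                break
            if v <= self.tree[left]:
                parent = left
            else:
                v -= self.tree[left]
                parent = right
        return leaf, self.tree[leaf], self.data[leaf - self.capacity + 1]

    @property
    def total_priority(self):
        return self.tree[0]

class Memory:
    epsilon = 0.01        # small offset so no transition ends up with zero priority
    alpha = 0.6           # how strongly TD error shapes the sampling distribution
    beta = 0.4            # initial importance-sampling correction, annealed towards 1
    beta_increment = 0.001
    abs_err_upper = 1.0   # clip absolute TD errors to this value

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        # new transitions get the current maximum priority so they are replayed at least once
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        self.tree.add(max_p if max_p > 0 else self.abs_err_upper, transition)

    def sample(self, n):
        self.beta = min(1.0, self.beta + self.beta_increment)
        tree_idx = np.empty(n, dtype=np.int32)
        is_weights = np.empty((n, 1))
        batch = []
        leaf_p = self.tree.tree[-self.tree.capacity:]
        min_prob = np.min(leaf_p[leaf_p > 0]) / self.tree.total_priority
        segment = self.tree.total_priority / n
        for i in range(n):
            v = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get_leaf(v)
            tree_idx[i] = idx
            is_weights[i, 0] = np.power((p / self.tree.total_priority) / min_prob, -self.beta)
            batch.append(data)
        return tree_idx, batch, is_weights

    def batch_update(self, tree_idx, abs_errors):
        abs_errors = np.asarray(abs_errors).reshape(-1) + self.epsilon
        priorities = np.power(np.minimum(abs_errors, self.abs_err_upper), self.alpha)
        for ti, p in zip(np.asarray(tree_idx).reshape(-1), priorities):
            self.tree.update(int(ti), p)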
Example #2
class DeepQNetwork:
    def __init__(self,
                 n_action,
                 n_width,
                 n_height,
                 n_channel,
                 learning_rate=0.0001,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=200,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None,
                 output_graph=True,
                 double_q=True,
                 dueling=True,
                 prioritized=True,
                 sess=None,
                 load_memory=False):

        self.n_action = n_action
        self.n_width = n_width
        self.n_height = n_height
        self.n_channel = n_channel

        self.n_l1 = 64

        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.double_q = double_q
        self.dueling = dueling
        self.prioritized = prioritized
        self.output_graph = output_graph

        self.learn_step_counter = 0

        if self.prioritized:
            self.memory = Memory(capacity=memory_size)
        else:
            self.memory = np.zeros((self.memory_size, n_width * 2 + 2))

        self.graph = tf.Graph()
        self._build_net()

        with self.graph.as_default() as graph:
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

            t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='target_net')
            e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='eval_net')
            self.replace_target_op = [
                tf.assign(t, e) for t, e in zip(t_params, e_params)
            ]

        if sess is None:
            self.sess = tf.Session(graph=self.graph)
            self.sess.run(self.init)
            # self.sess = tf.Session()
            # self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess

        if self.output_graph:
            self.summary_writer = tf.summary.FileWriter("log/", self.graph)
            # self.summary_writer = tf.summary.FileWriter("log/", self.sess.graph)

        self.cost_his = []

    def _build_net(self):
        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):

            # s = tf.reshape(s, [-1, 1, self.n_width, self.n_channel])
            n_filter = 32

            with tf.variable_scope('conv1') as scope:
                k1 = tf.get_variable('kernel1',
                                     shape=[1, 1, self.n_channel, n_filter],
                                     collections=c_names)
                conv1 = tf.nn.conv2d(s,
                                     k1,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')

            with tf.variable_scope('conv2') as scope:
                k2_1 = tf.get_variable('kernel2_1',
                                       shape=[1, 1, self.n_channel, n_filter],
                                       collections=c_names)
                conv2 = tf.nn.conv2d(s,
                                     k2_1,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')
                k2_2 = tf.get_variable('kernel2_2',
                                       shape=[3, 3, n_filter, n_filter],
                                       collections=c_names)
                conv2 = tf.nn.conv2d(conv2,
                                     k2_2,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')
                k2_3 = tf.get_variable('kernel2_3',
                                       shape=[3, 3, n_filter, n_filter],
                                       collections=c_names)
                conv2 = tf.nn.conv2d(conv2,
                                     k2_3,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')

            with tf.variable_scope('conv3') as scope:
                k3_1 = tf.get_variable('kernel3_1',
                                       shape=[1, 1, self.n_channel, n_filter],
                                       collections=c_names)
                conv3 = tf.nn.conv2d(s,
                                     k3_1,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')
                k3_2 = tf.get_variable('kernel3_2',
                                       shape=[5, 5, n_filter, n_filter],
                                       collections=c_names)
                conv3 = tf.nn.conv2d(conv3,
                                     k3_2,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')

            with tf.variable_scope('conv4') as scope:
                conv4 = tf.layers.average_pooling2d(s, [1, 3], [1, 1],
                                                    padding='SAME')
                k4 = tf.get_variable('kernel4',
                                     shape=[1, 1, self.n_channel, n_filter],
                                     collections=c_names)
                conv4 = tf.nn.conv2d(conv4,
                                     k4,
                                     strides=[1, 1, 1, 1],
                                     padding='SAME')

            with tf.variable_scope('concat') as scope:
                inception1 = tf.concat([conv1, conv2, conv3, conv4], axis=3)
                bias = tf.get_variable(name='biases',
                                       initializer=tf.constant_initializer(),
                                       shape=[4 * n_filter],
                                       collections=c_names)
                inception1 = tf.nn.relu(tf.nn.bias_add(inception1, bias))
                fc = tf.layers.average_pooling2d(inception1, [1, 8], [1, 8],
                                                 padding='SAME')
                # fc = tf.contrib.layers.flatten(fc)
                fc = tf.reshape(fc,
                                (tf.shape(fc)[0], self.n_width, n_filter * 4))

            with tf.variable_scope('rnn') as scope:
                cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_filter,
                                                    state_is_tuple=True)
                state_in = cell.zero_state(tf.shape(fc)[0], tf.float32)
                # cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
                rnn, state = tf.nn.dynamic_rnn(inputs=fc,
                                               cell=cell,
                                               dtype=tf.float32,
                                               initial_state=state_in)
                fc = state[1]
                # fc = tf.contrib.layers.flatten(rnn)

            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [n_filter, n_l1],
                                     initializer=w_initializer,
                                     collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1],
                                     initializer=b_initializer,
                                     collections=c_names)
                fc = tf.nn.relu(tf.matmul(fc, w1) + b1)

            if self.dueling:
                with tf.variable_scope('Value'):
                    w_out = tf.get_variable('w_out', [n_l1, 1],
                                            initializer=w_initializer,
                                            collections=c_names)
                    b_out = tf.get_variable('b_out', [1, 1],
                                            initializer=b_initializer,
                                            collections=c_names)
                    self.V = tf.matmul(fc, w_out) + b_out

                with tf.variable_scope('Advantage'):
                    w_out = tf.get_variable('w_out', [n_l1, self.n_action],
                                            initializer=w_initializer,
                                            collections=c_names)
                    b_out = tf.get_variable('b_out', [1, self.n_action],
                                            initializer=b_initializer,
                                            collections=c_names)
                    self.A = tf.matmul(fc, w_out) + b_out

                with tf.variable_scope('Q'):
                    out = self.V + (self.A - tf.reduce_mean(
                        self.A, axis=1, keep_dims=True))
            else:
                with tf.variable_scope('l2'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_action],
                                         initializer=w_initializer,
                                         collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_action],
                                         initializer=b_initializer,
                                         collections=c_names)
                    out = tf.matmul(fc, w2) + b2

            return out

        # ------------------ build evaluate_net ------------------
        with self.graph.as_default() as graph:

            if self.prioritized:
                self.ISWeights = tf.placeholder(tf.float32, [None, 1],
                                                name='IS_weights')

            self.s = tf.placeholder(
                tf.float32,
                [None, self.n_width, self.n_height, self.n_channel],
                name='s')  # input
            self.q_target = tf.placeholder(
                tf.float32, [None, self.n_action],
                name='Q_target')  # for calculating loss

            with tf.variable_scope('eval_net'):
                c_names, n_l1, w_initializer, b_initializer = \
                    ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.n_l1, \
                    tf.contrib.layers.xavier_initializer(), tf.random_normal_initializer()

                self.q_eval = build_layers(self.s, c_names, n_l1,
                                           w_initializer, b_initializer)

            with tf.variable_scope('loss'):
                if self.prioritized:
                    self.abs_errors = tf.reduce_sum(
                        tf.abs(self.q_target - self.q_eval),
                        axis=1)  # for updating Sumtree
                    self.loss = tf.reduce_mean(
                        self.ISWeights *
                        tf.squared_difference(self.q_target, self.q_eval))
                else:
                    self.loss = tf.reduce_mean(
                        tf.squared_difference(self.q_target, self.q_eval))

            with tf.variable_scope('train'):
                self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                    self.loss)

            # ------------------ build target_net ------------------
            self.s_ = tf.placeholder(
                tf.float32,
                [None, self.n_width, self.n_height, self.n_channel],
                name='s_')  # input
            with tf.variable_scope('target_net'):
                c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

                self.q_next = build_layers(self.s_, c_names, n_l1,
                                           w_initializer, b_initializer)

            with tf.variable_scope('summary') as scope:
                scalar_summary_tags = ['loss_avg', 'e_balance', \
                                     'q_max', 'q_total', 'epsilon', \
                                     'sharpe_ratio', 'n_trades', \
                                     'win', 'win_buy', 'win_sell', \
                                     'max_profit', 'avg_profit', 'max_loss', 'avg_loss', \
                                     'total_profit', 'total_loss', \
                                     'max_holding_period', 'avg_holding_period', \
                                     'avg_profit_holding_period', 'avg_loss_holding_period', \
                                     'max_floating_profit', 'max_floating_loss', \
                                     'max_total_balance', 'profit_make_good', \
                                     'up_buy', 'down_sell', \
                                     'n_buy', 'n_sell', 'reward', 'diff_sharpe']

                self.summary_placeholders = {}
                self.summary_ops = {}

                for tag in scalar_summary_tags:
                    self.summary_placeholders[tag] = tf.placeholder(
                        tf.float32, None, name=tag.replace(' ', '_') + '_0')
                    self.summary_ops[tag] = tf.summary.scalar(
                        tag, self.summary_placeholders[tag])

                # with tf.variable_scope('training_step'):
                #     training_step_mse = tf.summary.scalar('mse', self.loss)
                histogram_summary_tags = ['r_actions']

                for tag in histogram_summary_tags:
                    self.summary_placeholders[tag] = tf.placeholder(
                        'float32', None, name=tag.replace(' ', '_') + '_0')
                    self.summary_ops[tag] = tf.summary.histogram(
                        tag, self.summary_placeholders[tag])

    def store_transition(self, s, a, r, s_):

        # transition = np.hstack((s, [a, r], s_))
        transition = {'s': s, 'a': a, 'r': r, 's_': s_}

        if self.prioritized:  # prioritized replay
            self.memory.store(
                transition)  # have high priority for newly arrived transition
        else:
            if not hasattr(self, 'memory_counter'):
                self.memory_counter = 0
            index = self.memory_counter % self.memory_size
            self.memory[index, :] = transition
            self.memory_counter += 1

    def choose_action(self, observation, random=False):

        if np.random.uniform() > self.epsilon or random is True:  # random action (epsilon here is the probability of acting greedily)
            action = np.random.randint(0, self.n_action)
        else:
            observation = observation[np.newaxis, :]
            actions_value = self.sess.run(self.q_eval,
                                          feed_dict={self.s: observation})
            action = np.argmax(actions_value)

        return action

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(
                self.batch_size)
        else:
            if self.memory_counter > self.memory_size:
                sample_index = np.random.choice(self.memory_size,
                                                size=self.batch_size)
            else:
                sample_index = np.random.choice(self.memory_counter,
                                                size=self.batch_size)
            batch_memory = self.memory[sample_index, :]

        s = np.array([batch_memory[i]['s'] for i in range(self.batch_size)])
        s_ = np.array([batch_memory[i]['s_'] for i in range(self.batch_size)])

        q_next, q_eval4next = self.sess.run([self.q_next, self.q_eval],
                                            feed_dict={
                                                self.s_: s_,
                                                self.s: s_
                                            })
        # feed_dict={self.s_: batch_memory[:, -self.n_width:],    # next observation
        #            self.s: batch_memory[:, -self.n_width:]})    # next observation

        # q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_width]})
        q_eval = self.sess.run(self.q_eval, feed_dict={self.s: s})

        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # eval_act_index = batch_memory[:, self.n_width].astype(int)
        eval_act_index = np.array(
            [batch_memory[i]['a'] for i in range(self.batch_size)],
            dtype=np.int32)
        # reward = batch_memory[:, self.n_width + 1]
        reward = np.array(
            [batch_memory[i]['r'] for i in range(self.batch_size)])

        if self.double_q:
            max_act4next = np.argmax(
                q_eval4next, axis=1
            )  # the action that brings the highest value is evaluated by q_eval
            selected_q_next = q_next[
                batch_index,
                max_act4next]  # Double DQN, select q_next depending on above actions
        else:
            selected_q_next = np.max(q_next, axis=1)  # the natural DQN

        q_target[batch_index,
                 eval_act_index] = reward + self.gamma * selected_q_next

        if self.prioritized:
            _, abs_errors, self.cost = self.sess.run(
                [self._train_op, self.abs_errors, self.loss],
                #  feed_dict={self.s: batch_memory[:, :self.n_width],
                feed_dict={
                    self.s: s,
                    self.q_target: q_target,
                    self.ISWeights: ISWeights
                })
            self.memory.batch_update(tree_idx, abs_errors)
        else:
            _, self.cost = self.sess.run([self._train_op, self.loss],
                                         feed_dict={
                                             self.s:
                                             batch_memory[:, :self.n_width],
                                             self.q_target: q_target
                                         })
        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def inject_summary(self, tag_dict, episode):

        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })

        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, episode)

        # self.summary_writer.add_summary(self.param_summary, episode)

    def finish_episode(self, episode, stat):

        if episode > 0:
            injectDict = {
                # scalar
                'loss_avg': self.totalLoss,
                # 'r_balance': realBalance,
                'epsilon': self.epsilon,
                'q_max': self.totalMaxQ,
                'q_total': self.totalQ,
                'r_actions': self.r_actions
            }

            if self.output_graph:
                self.inject_summary(injectDict, episode)

            # self.saveParam(mode = 0)
            # if episode % self.ckptSavePeriod == 0:
            #     self.saveParam(dir = '%d' % (episode), mode = 1)

        self.r_actions = deque()
        self.totalLoss = 0.0
        self.totalQ = 0.0
        self.totalMaxQ = 0.0

    def load(self, step=0):

        print(sys.path)

        # checkpoint_dir = '/Users/cc/Project/Lean/Launcher/bin/Debug/python/oracle/data/'
        checkpoint_dir = './data'

        try:
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            self.learn_step_counter = int(
                os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        except:
            ckpt = None

        if not (ckpt and ckpt.model_checkpoint_path):
            print('Cannot find any saved sess in checkpoint_dir')
            #sys.exit(2)
        else:
            try:
                # self.saver = tf.train.Saver()
                self.saver.restore(self.sess, ckpt.model_checkpoint_path)
                self.summary_writer.add_session_log(
                    tf.SessionLog(status=tf.SessionLog.START),
                    global_step=step)
                print('Sess restored successfully: {}'.format(
                    ckpt.model_checkpoint_path))
            except Exception as e:
                print('Failed to load sess: {}'.format(str(e)))
                # sys.exit(2)
                self.learn_step_counter = 1

    def save(self, path=None):

        if (path is not None):
            save_path = path
        else:
            save_path = './data/sess.ckpt'

        self.saver.save(self.sess,
                        save_path,
                        global_step=self.learn_step_counter)
        print('Saving sess to {}: {}'.format(save_path,
                                             self.learn_step_counter))
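
For completeness, a hypothetical driver loop for the DeepQNetwork above might look like this; `env` is assumed to be a Gym-style environment whose observations already have shape (n_width, n_height, n_channel), and none of this appears in the original example.

def run_episode(agent, env, learn_start=200):
    # interact for one episode, storing transitions and learning once enough are buffered
    s = env.reset()
    total_reward, step = 0.0, 0
    while True:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r, s_)
        if step > learn_start:
            agent.learn()
        total_reward += r
        s = s_
        step += 1
        if done:
            return total_reward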
Example #3
class Agent(object):
    def __init__(self,
                 n_s,
                 n_a,
                 hiddens=(128, 64),
                 epsilon=1.0,
                 epsilon_min=0.005,
                 epsilon_decay=0.05,
                 gamma=0.99,
                 batch_size=64,
                 memory_capacity=100000,
                 lr=0.001,
                 is_dueling=False,
                 is_prioritize=True,
                 replace_iter=100,
                 is_soft=False,
                 tau=0.01,
                 e=0.01,
                 a=0.6,
                 b=0.4):
        self.n_s = n_s
        self.n_a = n_a
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_iter = replace_iter
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.is_soft = is_soft
        self.is_prioritize = is_prioritize
        self.tau = tau
        if use_gpu:
            self.eval_net = Net(n_s,
                                n_a,
                                hiddens=hiddens,
                                is_dueling=is_dueling).cuda()
            self.target_net = Net(n_s,
                                  n_a,
                                  hiddens=hiddens,
                                  is_dueling=is_dueling).cuda()
        else:
            self.eval_net = Net(n_s,
                                n_a,
                                hiddens=hiddens,
                                is_dueling=is_dueling)
            self.target_net = Net(n_s,
                                  n_a,
                                  hiddens=hiddens,
                                  is_dueling=is_dueling)
        if is_prioritize:
            self.memory = Memory(memory_capacity, e, a, b)
        else:
            self.memory = np.zeros((memory_capacity, self.n_s * 2 + 2))
        self.memory_count = 0
        self.learn_count = 0

        self.loss_func = nn.MSELoss()
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.lr)

    def act(self, s):
        if np.random.random() <= self.epsilon:
            # random
            return np.random.randint(self.n_a)
        else:
            # max
            s = FloatTensor(s)
            action_value = self.eval_net(s)
            a = torch.max(action_value, 1)[1].data.cpu().numpy()[0]
            return a

    def step(self, s, a, r, s_, done):
        if self.is_prioritize:
            # experience = s, a, r, s_, done
            experience = np.hstack((s, [a, r], s_))
            self.memory.store(experience)
            self.memory_count += 1
            if np.count_nonzero(self.memory.tree.tree) > self.batch_size:
                tree_idx, batch, ISWeights_mb = self.memory.sample(
                    self.batch_size)
                self.learn(batch, tree_idx, ISWeights_mb)
        else:
            transition = np.hstack((s, [a, r], s_))
            # replace the old memory with new memory
            index = self.memory_count % self.memory_capacity
            self.memory[index, :] = transition
            self.memory_count += 1
            if self.memory_count < self.memory_capacity:
                return
            # sample batch transitions
            sample_index = np.random.choice(self.memory_capacity,
                                            self.batch_size)
            batch = self.memory[sample_index, :]
            self.learn(batch)

    def learn(self, batch, tree_idx=None, ISWeights_mb=None):
        b_s = torch.squeeze(FloatTensor(batch[:, :self.n_s]), 0)
        b_a = torch.squeeze(LongTensor(batch[:, self.n_s:self.n_s + 1]), 0)
        b_r = torch.squeeze(FloatTensor(batch[:, self.n_s + 1:self.n_s + 2]),
                            0)
        b_s_ = torch.squeeze(FloatTensor(batch[:, -self.n_s:]), 0)
        temp = self.eval_net(b_s)
        eval_q = torch.gather(temp, 1, b_a)
        next_max_from_eval = self.eval_net(b_s_)
        next_max_from_eval_index = next_max_from_eval.max(1)[1].unsqueeze(1)
        next_actions = self.target_net(b_s_).detach()
        next_max = next_actions.gather(1, next_max_from_eval_index)
        target_q = b_r + self.gamma * next_max  # * (1 - b_done)
        abs_errors = torch.sum(torch.abs(target_q - eval_q), dim=1).detach().cpu().numpy()
        loss = self.loss_func(eval_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.is_prioritize:
            self.memory.batch_update(tree_idx=tree_idx, abs_errors=abs_errors)
        self.update()
        self.learn_count += 1

    def update(self):
        next_epsilon = self.epsilon * self.epsilon_decay
        if next_epsilon <= self.epsilon_min:
            self.epsilon = self.epsilon_min
        else:
            self.epsilon = next_epsilon
        if self.is_soft:
            for target_param, local_param in zip(self.target_net.parameters(),
                                                 self.eval_net.parameters()):
                target_param.data.copy_(self.tau * local_param.data +
                                        (1.0 - self.tau) * target_param.data)
        else:
            if self.learn_count % self.replace_iter == 0:
                self.target_net.load_state_dict(self.eval_net.state_dict())

    # save all net
    def save(self, name):
        torch.save(self.eval_net, name)

    # load all net
    def load(self, name):
        return torch.load(name)
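
A hypothetical training loop for the Agent above (not part of the original example), assuming a Gym-style environment with a flat observation of length n_s. Note that act() indexes dimension 1, so the state is batched before the call, while step() expects the unbatched state for np.hstack.

import numpy as np

def train_agent(agent, env, episodes=500):
    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            a = agent.act(s[np.newaxis, :])      # act() expects a leading batch dimension
            s_, r, done, _ = env.step(a)
            agent.step(s, a, r, s_, done)        # stores the transition and learns when ready
            s = s_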
Example #4
         
         Qs_next_state = sess.run(dqn.output, feed_dict = {dqn.inputs_: next_state_mb})
         
         target_Qs_next_state = sess.run(targetnet.output, feed_dict = {targetnet.inputs_: next_state_mb})
         
         for i in range(0, batch_size):
             
             terminal = done_mb[i]
             action = np.argmax(Qs_next_state[i])
             
             if terminal:
                 target = reward_mb[i]
                 
             else:
                 target = reward_mb[i] + gamma * target_Qs_next_state[i][action]
             
             target_Qs_batch.append(target)
         target_mb = np.array([each for each in target_Qs_batch])
         
         _, loss, abs_error = sess.run([dqn.optim, dqn.loss, dqn.absolute_errors], feed_dict = {dqn.inputs_: state_mb,
                                                                                                dqn.actions_: action_mb,
                                                                                                dqn.ISWeights: ISWeights_mb,
                                                                                                dqn.target_Q: target_mb})

         memory.batch_update(tree_idx, abs_error)
         
         if tau > max_tau:
             
             op_holder = update_target_graph()
             sess.run(op_holder)
         
class Agent:
    def __init__(self, demo_transitions=None):
        replay_buffer_size = config.REPLAY_BUFFER_SIZE
        demo_buffer_size = config.DEMO_BUFFER_SIZE
        # replay_memory stores both demo data and generated data
        self.replay_memory = Memory(capacity=replay_buffer_size, permanent_size=len(demo_transitions))
        # demo_memory only store demo data
        self.demo_memory = Memory(capacity=demo_buffer_size, permanent_size=demo_buffer_size)
        self.epsilon = config.INITIAL_EPSILON
        self.steps_done = 0
        #
        self.target_net = DQN().to(device, dtype=torch.double)
        self.policy_net = DQN().to(device,dtype=torch.double)

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=config.LEARNING_RATE, weight_decay=1)

    def replay_memory_push(self, transitions):
        """
        Add transitions to replay_memory
        :param transitions: List of transitions
        :return:
        """
        for t in transitions:
            self.replay_memory.push(np.array(t, dtype=object))

    def demo_memory_push(self, transitions):
        """
        Add transitions to demo_memory
        :param transitions: List of transitions
        :return:
        """
        for t in transitions:
            self.demo_memory.push(np.array(t, dtype=object))

    def e_greedy_select_action(self, state):
        """

        :param state:
        :return:
        """
        self.epsilon = config.FINAL_EPSILON + (config.INITIAL_EPSILON - config.FINAL_EPSILON) * \
                       np.exp(-1. * self.steps_done / config.EPSILON_DECAY)
        self.steps_done += 1
        if random.random() <= self.epsilon or state is None:
            return random.randint(0, config.ACTION_DIM - 1)
        else:
            if isinstance(state, np.ndarray):
                state = torch.from_numpy(state).to(device, dtype=torch.double)
            return self.policy_net(state.to(device, dtype=torch.double)).max(1)[1].view(1, 1).item()  # TODO:

    def pre_train(self):
        """
        pre train
        :return:
        """
        k = config.PRE_TRAIN_STEP_NUM
        print("Pre training for %d steps." % k)
        # for i in tqdm(range(k)):
        for i in range(k):
            self.train(pre_train=True)
            print('Pretrain steps: %d' % i)
            if i % config.TARGET_UPDATE == 0:
                self.update_target_net()
                print('Target network updated!')
        print("Pre training done for %d steps." % k)

    def train(self, pre_train=False):
        """
        train Q network
        :param pre_train: if used for pre train or not
        :return:
        """

        # choose which memory to use
        mem = self.demo_memory if pre_train else self.replay_memory
        #  sample
        batch_id, batch_data, batch_weight = mem.sample(config.BATCH_SIZE)

        # extract data from each column
        batch = Transition(*zip(*batch_data.tolist()))  # unpack the array of transitions into batched fields

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.uint8)  # TODO: change to target state when appropriate
        non_final_next_states = torch.cat([torch.Tensor(s.double()) for s in batch.next_state if s is not None]).double()
        state_batch = torch.cat(batch.state).double()
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward).double()
        n_reward_batch = torch.cat(batch.n_reward).double()

        # Compute Q(s_t, a): the model computes Q(s_t), then we gather the values of the actions that were taken
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)  # calculate Q(s_t, a, \theta) under the current actions
        next_state_values = torch.zeros(config.BATCH_SIZE, device=device)         # Compute V(s_{t+1}) for all
        # next_state_values[non_final_mask] = self.policy_net(non_final_next_states).data.max(1)[0]  #next maximum state values  #DQN
        action_batch_next_state = self.policy_net(non_final_next_states).max(1)[1].unsqueeze(1)  #DDQN
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, action_batch_next_state).squeeze().detach()  #DDQN
        expected_state_action_values = (next_state_values * config.Q_GAMMA) + reward_batch.squeeze(1)

        # calculating the q loss and n-step return loss
        q_loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1), size_average=False)
        n_step_loss = F.mse_loss(state_action_values, n_reward_batch.unsqueeze(1), size_average=False)
        n_step_loss = 0  # overrides the value computed above, effectively disabling the n-step term


        # calculating the supervised loss
        if pre_train:
            action_dim = config.ACTION_DIM
            margins = (torch.ones(action_dim, action_dim) - torch.eye(action_dim)) * config.SU_LOSS_MARGIN
            batch_margins = margins[action_batch.data.squeeze().cpu()]
            state_action_values_with_margin = self.policy_net(state_batch) + batch_margins
            supervised_loss = (state_action_values_with_margin.max(1)[0].unsqueeze(1) - state_action_values).pow(2).sum()
        else:
            supervised_loss = 0.0


        loss = q_loss + config.SU_LOSS_LAMBDA * supervised_loss + config.N_STEP_LOSS_LAMBDA * n_step_loss

        # optimization step and logging
        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 100)
        # self.optimizer.step()

        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        with torch.no_grad():
            abs_errors = torch.sum(torch.abs(state_action_values - expected_state_action_values.unsqueeze(1)), dim=1)
            abs_errors = abs_errors.detach().cpu().numpy()

        mem.batch_update(batch_id, abs_errors)  # update priorities in the memory that was actually sampled from

    def update_target_net(self):
        """

        :return:
        """
        # Update the target network
        self.target_net.load_state_dict(self.policy_net.state_dict())
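
The Transition tuple unpacked in train() is not defined in this listing. A plausible definition, inferred from the fields accessed above (state, action, reward, next_state, n_reward), would be the following namedtuple; the field order is an assumption.

from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'n_reward'))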
Example #6
class Agent:

    sigma = 0.2
    alpha = 1.01
    epsilon = 0.5
    min_epsilon = 0.01
    name = ["iqn_e", "iqn.h5"]
    custom_objects = custom_objects

    def __init__(self, action_size=3, lr=1e-3, n=3, spread=5, step_size=1000, money=10000, leverage=500, restore=False):
        self.n = n
        self.spread = spread
        self.action_size = action_size
        self.step_size = step_size
        self.lr = lr
        self.money = money
        self.leverage = leverage
        self.restore = restore
        self.build_model = model
        self.memory = Memory(50000)
        self.state()
        self.build()
        self.w = self.model.get_weights()
        self.reset = 0
        self.e = []

    def build(self):
        if self.restore:
            self.i = np.load(f"{self.name[0]}.npy")
            self.model = tf.keras.models.load_model(self.name[1], custom_objects=self.custom_objects)
        else:
            self.i = 0
            self.model = self.build_model(self.x.shape[-2:], self.action_size)
            opt = tfa.optimizers.Lookahead(tf.keras.optimizers.Nadam(self.lr))
            # opt =
            self.model.compile(opt)

        self.target_model = self.build_model(self.x.shape[-2:], self.action_size)
        self.target_model.set_weights(self.model.get_weights())

        get = self.model.get_layer
        self.q = tf.keras.backend.function([get("i").input,get("t").input], get("q").output)
        get = self.target_model.get_layer
        self.target_q = tf.keras.backend.function([get("i").input,get("t").input], get("q").output)

    def state(self):
        t = 1
        x = np.load(f"x{t}.npy")
        shape = x.shape
        self.x = x.reshape((shape[0], -1, shape[-2], shape[-1]))
        y = np.load(f"target{t}.npy")
        shape = y.shape
        y = y.reshape((shape[0], y.shape[2], -1))
        self.y, self.v, self.atr, self.high, self.low = \
            y[:, 0], y[:, 1], y[:, 2], y[:, 3], y[:, 4]

        self.train_step = np.arange(0, int(self.x.shape[1] - self.x.shape[1] * 0.2 - self.step_size), self.step_size)
        # self.train_step = np.arange(0, int(self.x.shape[1] - self.x.shape[1] * 0.2 - self.step_size))
        self.test_step = self.train_step[-1] + self.step_size, self.x.shape[1] - self.step_size
        self.test_step2 = np.arange(self.test_step[0], self.test_step[1], self.step_size)


    def train(self, b = 128):
        tree_idx, replay, isw = self.memory.sample(b)

        self.states = states = np.array([a[0][0] for a in replay], np.float32)
        new_states = np.array([a[0][3] for a in replay], np.float32)
        actions = np.array([a[0][1] for a in replay]).reshape((-1, 1))
        rewards = np.array([a[0][2] for a in replay], np.float32).reshape((-1, 1))
        gamma = np.array([a[0][4] for a in replay]).reshape((-1, 1))

        self.tau = tau = np.random.uniform(0, 1, (len(tree_idx), 32))
        target_tau = np.random.uniform(0, 1, (len(tree_idx), 32))

        target_q = self.target_q([new_states, target_tau])
        target_a = np.argmax(np.sum(self.q([new_states, tau]), -1), -1)

        with tf.GradientTape() as tape:
            q = self.model([states, tau])
            q_backup = q.numpy()

            for i in range(len(tree_idx)):
                q_backup[i, actions[i]] = rewards[i] + gamma[i] * target_q[i, target_a[i]]

            error = q_backup - q
            tau = tau.reshape((-1, 1, 32))

            huber = tf.where(abs(error) <= 2, error ** 2 * .5, .5 * 2 ** 2 + 2 * (tf.abs(error) - 2))
            loss = tf.maximum(tau * huber, (tau - 1) * huber)

            error = tf.reduce_sum(tf.reduce_sum(loss, 1), -1)
            loss = tf.reduce_mean(error)
            # loss = tf.reduce_mean(error * isw)

        self.e.append(loss)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        # gradients = [tf.clip_by_value(g, -1, 1) for g in gradients]
        self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        ae = error.numpy().reshape((-1,))
        self.ae = ae
        self.memory.batch_update(tree_idx, ae)

        self.target_model.set_weights(0.005 * np.array(self.model.get_weights()) + 0.995 * np.array(self.target_model.get_weights()))


    def step(self, types=0):
        train = True if types == 0 else False
        step = range(25) if train else range(10)
        self.exp = []

        for _ in step:
            s = 0
            if types == 2:
                h = np.random.randint(self.test_step[0], self.test_step[1])
            else:
                h = np.random.choice(self.train_step)

            self.df = df = self.x[s, h:h + self.step_size]
            self.trend = trend = self.y[s, h:h + self.step_size]
            v = self.v[s, h:h + self.step_size]

            if not train:
                old_a = 0
                lot = 0
                money = self.money
                self.pip = []

                tau = np.random.uniform(0, 1, (self.step_size, 32))
                q = self.q([df, tau])
                q = np.mean(q, -1) / (np.sqrt(np.std(q, -1)) + 1e-10)
                self.a = action = np.argmax(q, -1)
                # action = np.argmax( np.sum( self.q([df, tau]), -1 ), -1)

                for idx, action in zip(range(len(trend) - 1), action):
                    action = 0 if action == 0 else -1 if action == 1 else 1

                    if (action == 1 or action == -1) and lot == 0:
                        lot = (money * 0.05 / (trend[idx] / self.leverage))

                    r = trend[idx + 1] - trend[idx]
                    r = (action * r - self.spread * np.abs(old_a - action)) * lot
                    money += r
                    money = np.clip(money, 0, None)
                    self.pip.append(r)
                    if old_a != action:
                        lot = 0

                    if money <= 0:
                        break

                    old_a = action

                g = ((money - self.money) / self.money) * 100

                self.exp.append(g)

            else:
                gammas = []
                position = 0
                actions = []
                rewards = []
                old_a = 0
                noise_w = [w + np.random.normal(0, self.sigma, w.shape) for w in self.w]
                noise = np.random.normal(0, 0.1, self.action_size)
                self.model.set_weights(noise_w)

                for idx in range(len(trend) -1):
                    df_t = np.array([df[idx]])
                    df_t = np.random.normal(df_t, 0.005)
                    if np.random.rand() > 0.1:
                        tau = np.random.uniform(0, 1, (1, 32))
                        q = self.q([df_t, tau])
                        q = np.mean(q, -1)
                        action = np.argmax(q, -1)[0]
                    else:
                        tau = np.random.uniform(0, 1, (1, 32))
                        q = self.q([df_t, tau])
                        q = np.mean(q, -1)
                        q = np.abs(q) / np.sum(np.abs(q), 1).reshape((-1, 1)) * (np.abs(q) / q)
                        q += noise
                        action = np.argmax(q, -1)[0]


                    action = int(action)
                    actions.append(action)
                    action = action if action == 0 else -1 if action == 1 else 1

                    if old_a == action:
                        r = 0
                        # r = trend[idx + 1] - trend[idx]
                        # r = action * r - self.spread * np.abs(old_a - action)
                        gamma = 0.99
                    elif position != 0:
                        r = trend[idx + 1] - position
                        r = action * r - self.spread# * np.abs(old_a - action)
                        gamma = 0
                        position = 0
                    else:
                        r = 0
                        gamma = 0.99

                    if (action == -1 or action == 1) and position == 0:
                        position = trend[idx]

                    gammas.append(gamma)
                    rewards.append(r)

                    old_a = action

                    if len(rewards) > self.n:
                        r = np.sum(rewards[-self.n:]) * 0.99 ** self.n
                        if gammas[idx - (self.n - 1)] == 0.99 and 0 in gammas[-self.n:]:
                            gammas[idx - (self.n - 1)] = 0.1
                        try:
                            e = df[idx - (self.n - 1)], actions[idx - (self.n - 1)], r, df[idx + self.n], gammas[
                                idx - (self.n - 1)]
                            self.memory.store(e)
                            if (self.restore + 1) % 64 == 0:
                                self.model.set_weights(self.w)
                                self.train()
                                self.w = self.model.get_weights()
                                noise = np.random.normal(0, 0.1, self.action_size)
                            self.restore += 1
                        except:
                            pass

                    if (idx + 1) % (self.step_size // 2) == 0:
                        # computationally expensive
                        self.epsilon = np.clip(self.epsilon * 0.99999, 0.05, None)
                        self.threshold = -np.log(1 - self.epsilon + self.epsilon / self.action_size)
                        self.model.set_weights(self.w)
                        q = self.q([self.states, self.tau])
                        q = tf.reduce_mean(q, -1)
                        noise_w = [w + np.random.normal(0, self.sigma, w.shape) for w in self.w]
                        self.model.set_weights(noise_w)
                        qe = self.q([self.states, self.tau])
                        qe = tf.reduce_mean(qe, -1)

                        kl = tf.reduce_sum(
                                        tf.nn.softmax(q) * (
                                        tf.math.log(tf.nn.softmax(q) + 1e-10) - tf.math.log(tf.nn.softmax(qe) + 1e-10)),
                                        axis=-1)

                        mean_kl = np.mean(kl.numpy())
                        self.sigma = self.alpha * self.sigma if mean_kl < self.threshold else 1 / self.alpha * self.sigma
                        noise_w = [w + np.random.normal(0, self.sigma, w.shape) for w in self.w]
                        self.model.set_weights(noise_w)

                self.i += 1
        if train:
            self.model.set_weights(self.w)

    def run(self):
        train_h = []
        test_h = []
        for idx in range(10000):
            start = time.time()
            if idx % 10 == 0:
                self.h = np.random.choice(self.train_step)
            self.step(0)

            train = []
            test = []
            for _ in range(1):
                self.step(1)
                train.extend(self.exp)
                self.step(2)
                test.extend(self.exp)

            print(f"epoch {self.i}")
            print(f"speed {time.time() - start}sec")
            plt.cla()
            train_h.append(np.median(train))
            test_h.append(np.median(test))

            plt.plot(train_h, label="train")
            plt.plot(test_h, label="test")
            plt.show()

            df = pd.DataFrame({"train": np.array(train),
                               "test": np.array(test)})
            print(df.describe())

            np.save(self.name[0], self.i)
            self.model.save(self.name[1])

            try:
                _ = shutil.copy(f"/content/{self.name[1]}", "/content/drive/My Drive")
                _ = shutil.copy(f"/content/{self.name[0]}.npy", "/content/drive/My Drive")
            except:
                pass
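
For reference, the quantile Huber loss that IQN-style training normally uses can be written as a standalone function. This is a general sketch with kappa=2 to match the threshold used in train() above; the shapes and reductions are illustrative and are not taken from the example.

import tensorflow as tf

def quantile_huber_loss(td_error, tau, kappa=2.0):
    # td_error and tau are expected to broadcast against each other
    abs_err = tf.abs(td_error)
    huber = tf.where(abs_err <= kappa,
                     0.5 * tf.square(td_error),
                     kappa * (abs_err - 0.5 * kappa))
    # asymmetric weighting by |tau - 1{td_error < 0}|
    weight = tf.abs(tau - tf.cast(td_error < 0.0, td_error.dtype))
    return tf.reduce_mean(tf.reduce_sum(weight * huber, axis=-1))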
Example #7
class Model(object):
    def __init__(self, id, env, action_noise=None, action_bounds=(-1., 1.)):
        self.reward_dict = defaultdict(float)
        self.id = id
        self.env = env
        self.pi_fn = PI_FEATURE_NUM
        self.critic_fn = CRITIC_FEATURE_NUM
        self.action_bounds = action_bounds

        self.actor = Actor(self.pi_fn, AGENT_ACTION_CNT)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.reward_net = RewardNet(self.critic_fn, CRITIC_ACTION_NUM)
        self.rn_optim = Adam(self.reward_net.parameters(), lr=CRITIC_LR)

        self.memory = Memory()
        self.action_noise = action_noise

    def pi(self, state, all_memory_ready, apply_noise=True, done=False):
        if done:
            return np.array([0.])

        if not all_memory_ready and apply_noise:
            sigma = np.clip(self.memory.size() / float(MEMORY_MIN_SIZE), 0,
                            1) * ACTION_NOISE_STDDEV
            self.action_noise.set_sigma(sigma)
            return np.clip(
                np.array([0.]) + self.action_noise(), self.action_bounds[0],
                self.action_bounds[1])

        with torch.no_grad():
            obs = get_pi_obs(state, self.id)
            action = float(self.actor(to_tensor(obs)).detach().numpy())

            if self.action_noise is not None and apply_noise:
                noise = self.action_noise()
                action += noise

            action = np.clip(action, self.action_bounds[0],
                             self.action_bounds[1])

            return action

    def is_memory_ready(self):
        return self.memory.size() > MEMORY_MIN_SIZE

    def get_joint_info(self, batch, cur_date):
        interval = int(batch.shape[1] / MAX_GC_CNT)
        index = list(self.env.date_gc_index[cur_date])
        index.sort()
        index = np.array(index)
        if interval == 1:
            return batch[:, index]
        else:
            all_index = index
            for i in range(len(batch) - 1):
                all_index = np.concatenate(
                    [all_index, index + (i + 1) * MAX_GC_CNT])

            return np.reshape(
                np.reshape(batch, (-1, interval))[all_index],
                (-1, interval * len(index)))

    def train(self, all_agents, cur_date):
        if self.memory.size() < BATCH_SIZE * 50:
            return None

        # Get a batch.
        idx, batch, isw = self.memory.sample(batch_size=BATCH_SIZE)

        # Get latest reward, since the reward is updating during iteration
        get_latest_reward(self.reward_dict, batch, self.id)

        self.rn_optim.zero_grad()
        s = to_tensor(batch['obs0'], requires_grad=True)
        a = to_tensor(batch['actions'], requires_grad=True)
        r = to_tensor(batch['rewards'], requires_grad=True)
        q = self.reward_net([s, a + ACTION_SCALE])
        q_loss = torch.nn.MSELoss()(q, r)
        q_loss.backward()
        self.rn_optim.step()

        tderr = np.abs((q - r).detach().numpy())
        self.memory.batch_update(idx, tderr)

        self.actor_optim.zero_grad()
        s = to_tensor(batch['obs0'], requires_grad=True)
        step_left = s[:, 0] * (MAX_STEP + 1)
        a_loss = -(self.reward_net([s, self.actor(s) + ACTION_SCALE]) *
                   step_left).mean()
        a_loss.backward()
        self.actor_optim.step()

        return q_loss.detach().numpy(), a_loss.detach().numpy()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed = 0, buffer_size = int(1e4), batch_size = 64, gamma = 0.99, tau = 1e-3, lr = 7e-4, update_every = 4):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,  fc1_units=32, fc2_units=8).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,  fc1_units=32, fc2_units=8).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = Memory(buffer_size, state_size, alpha = 0.6) # replay buffer size

        # Parameters
        self.batch_size = batch_size # minibatch size
        self.gamma = gamma # discount factor
        self.tau = tau # for soft update of target parameters
        self.update_every = update_every # how often to update the network

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done, i_episode):
        # Save experience in replay memory
        self.memory.store(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # if enough samples are available in memory, get a random subset and learn
            if len(self.memory) >= self.batch_size:
                self.learn(self.gamma, i_episode)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma, episode_n):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        tree_id, states, actions, rewards, next_states, dones, ISWeights = self.memory.sample(self.batch_size)
        # Double DQN
        # Use local network to select max Q for actions in every experience
        Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
        # Use gather to get the same actions but from the Q on target network
        Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) 
        
        # Normal DQN
        # use target network for selecting next Q value
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
    
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        dt_errors = Q_targets - Q_expected
        
        self.memory.batch_update(tree_id, (abs(dt_errors) + 1e-5).cpu().detach().numpy().flatten())

        # Compute loss
        loss =  torch.mul(dt_errors.pow(2), ISWeights)
        loss = torch.mean(loss)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)                     

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
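
A hypothetical interaction loop for the Agent above (Gym-style environment returning a flat numpy state of length state_size; the epsilon schedule is chosen for illustration); this does not appear in the original example.

def run(agent, env, n_episodes=300, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, i_episode)
            state = next_state
        eps = max(eps_end, eps_decay * eps)    # decay exploration between episodes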
Example #9
class DQfD:
    def __init__(self, env, config, demo_transitions=None):
        self.sess = tf.InteractiveSession()
        self.config = config
        # replay_memory stores both demo data and generated data, while demo_memory only store demo data
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo_transitions))
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(
            demo_transitions=demo_transitions
        )  # add demo data to both demo_memory & replay_memory
        self.time_step = 0
        self.epsilon = self.config.INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder(
            "float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])
        self.eval_input = tf.placeholder("float", [None, self.state_dim])
        self.select_input = tf.placeholder("float", [None, self.state_dim])

        # touching these lazy properties forces the corresponding graph pieces to be built now
        self.Q_eval
        self.Q_select

        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())

        self.save_model()
        self.restore_model()

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert demo data to pre-train
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{}-th pre-train step finished ...'.format(i))
        self.time_step = 0
        print('Pre-training finished.')

    # TODO: How to add the variables created by tf.layers.dense to a custom collection?
    # def build_layers(self, state, collections, units_1, units_2, w_i, b_i, regularizer=None):
    #     with tf.variable_scope('dese1'):
    #         dense1 = tf.layers.dense(tf.contrib.layers.flatten(state), activation=tf.nn.relu, units=units_1,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dens2'):
    #         dense2 = tf.layers.dense(dense1, activation=tf.nn.relu, units=units_2,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dene3'):
    #         dense3 = tf.layers.dense(dense2, activation=tf.nn.relu, units=self.action_dim,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     return dense3

    def build_layers(self,
                     state,
                     c_names,
                     units_1,
                     units_2,
                     w_i,
                     b_i,
                     reg=None):
        a_d = self.action_dim
        s_d = self.state_dim
        with tf.variable_scope('l1'):
            # the first layer consumes the state, so its weight matrix is [state_dim, units_1]
            w1 = tf.get_variable('w1', [s_d, units_1],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b1 = tf.get_variable('b1', [1, units_1],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [units_1, units_2],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b2 = tf.get_variable('b2', [1, units_2],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2)
        with tf.variable_scope('l3'):
            w3 = tf.get_variable('w3', [units_2, a_d],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b3 = tf.get_variable('b3', [1, a_d],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense3 = tf.matmul(dense2, w3) + b3
        return dense3

    @lazy_property
    def Q_select(self):
        with tf.variable_scope('select_net') as scope:
            # include TRAINABLE_VARIABLES so the optimizer can actually update the select-net
            c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES,
                       tf.GraphKeys.TRAINABLE_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            reg = tf.contrib.layers.l2_regularizer(
                scale=0.2)  # Note: only parameters in select-net need L2
            return self.build_layers(self.select_input, c_names, 24, 24, w_i,
                                     b_i, reg)

    @lazy_property
    def Q_eval(self):
        with tf.variable_scope('eval_net') as scope:
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            return self.build_layers(self.eval_input, c_names, 24, 24, w_i,
                                     b_i)

    def loss_l(self, ae, a):
        # expert margin l(a_E, a): 0 for the expert action, 0.8 otherwise
        # (ae is a tensor, so compare with tf.equal rather than Python ==)
        return tf.where(tf.equal(ae, a), 0.0, 0.8)

    def loss_jeq(self, Q_select):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a),
                                       max_value)
            jeq += self.isdemo[i] * (max_value - Q_select[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.n_step_y_input))
        # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input)
        l_jeq = self.loss_jeq(self.Q_select)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l)
            for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ
            for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.Q_select),
                             axis=1)  # only use 1-step R to compute abs_errors

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(
            self.loss)  # only parameters in the select-net are optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('select_net_params')
        eval_params = tf.get_collection('eval_net_params')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def save_model(self):
        print("Model saved in : {}".format(
            self.saver.save(self.sess, self.config.MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH)
        print("Model restored.")

    def perceive(self, transition):
        self.replay_memory.store(np.array(transition))
        # decay epsilon toward FINAL_EPSILON (the minimum) once the replay memory is full
        if self.replay_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON,
                               self.epsilon * self.config.EPSILIN_DECAY)

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means should sample from demo_buffer instead of replay_buffer
        :param update: True means the action "update_target_net" executes outside, and can be ignored in the function
        """
        # sampling should only happen AFTER the replay memory is filled
        if not pre_train and not self.replay_memory.full():
            return
        self.time_step += 1

        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)

        np.random.shuffle(minibatch)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the Q-values needed to build the targets before feeding the placeholders
        Q_select = self.Q_select.eval(
            feed_dict={self.select_input: next_state_batch})
        Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: next_state_batch})
        n_step_Q_select = self.Q_select.eval(
            feed_dict={self.select_input: n_step_state_batch})
        n_step_Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t
            temp = self.Q_select.eval(
                feed_dict={
                    self.select_input: state_batch[i].reshape((-1,
                                                               self.state_dim))
                })[0]
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(Q_select[i])
            temp[action_batch[i]] = reward_batch[i] + (
                1 - int(done_batch[i])) * self.config.GAMMA * Q_eval[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_Q_select[i])
            q_n_step = (
                1 - int(n_step_done_batch[i])
            ) * self.config.GAMMA**actual_n[i] * n_step_Q_eval[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.select_input: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })

        self.replay_memory.batch_update(
            tree_idxes, abs_errors)  # update priorities for data in memory

        # In this example each episode has a bounded number of steps, so the target-net update
        # can be triggered externally after an episode ends; when update is False the caller handles it
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(
            self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
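
# For reference: the per-sample large-margin term that loss_jeq builds with Python
# loops, max_a[Q(s,a) + l(a_E, a)] - Q(s, a_E), can be written vectorized. A
# standalone numpy sketch with hypothetical Q-values and expert actions (not code
# from the example above):
import numpy as np

Q = np.array([[1.0, 0.5, 0.2, 0.1],          # toy batch: 3 states, 4 actions
              [0.3, 0.9, 0.4, 0.0],
              [0.2, 0.2, 0.8, 0.6]])
a_expert = np.array([0, 2, 2])               # hypothetical expert actions
margin = 0.8                                 # same constant as loss_l

l = np.full_like(Q, margin)                  # l(a_E, a): margin everywhere ...
l[np.arange(len(Q)), a_expert] = 0.0         # ... except at the expert action
jeq = (Q + l).max(axis=1) - Q[np.arange(len(Q)), a_expert]
# jeq is zero only when the expert action beats every alternative by at least the margin
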
class Dqn_agent:
    def __init__(self, asset_num, division, feature_num, gamma,
                 network_topology, learning_rate, epsilon, epsilon_Min,
                 epsilon_decay_period, update_tar_period, history_length,
                 memory_size, batch_size, save_period, name, save):

        self.epsilon = epsilon
        self.epsilon_min = epsilon_Min
        self.epsilon_decay_period = epsilon_decay_period
        self.asset_num = asset_num
        self.division = division
        self.gamma = gamma
        self.name = name
        self.update_tar_period = update_tar_period
        self.history_length = history_length
        self.feature_num = feature_num
        self.global_step = tf.Variable(0, trainable=False)
        self.lr = learning_rate
        self.cnn_trainable = True
        self.action_num, self.actions = action_discretization(
            self.asset_num, self.division)
        config = tf.ConfigProto()

        self.sess = tf.Session(config=config)

        network_topology['output_num'] = self.action_num

        self.network_config = network_topology
        self.initialize_graph()
        t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope='estm_net')

        # assign parameters of estimate Q-net to target Q-net
        self.update_target = [
            tf.assign(t, l) for t, l in zip(t_params, e_params)
        ]
        self.sess.run(tf.global_variables_initializer())
        self.memory = Memory(self.action_num,
                             self.actions,
                             memory_size=memory_size,
                             batch_size=batch_size)

        if save:
            self.save = save
            self.save_period = save_period
            self.name = name
            self.saver = tf.train.Saver()
        else:
            self.save = False

    # initialize variables that will be used in the training process
    def initialize_graph(self):
        # current price tensor
        self.price_his = tf.placeholder(dtype=tf.float32,
                                        shape=[
                                            None, self.asset_num - 1,
                                            self.history_length,
                                            self.feature_num
                                        ],
                                        name="ob")

        # price tensor of next step
        self.price_his_ = tf.placeholder(dtype=tf.float32,
                                         shape=[
                                             None, self.asset_num - 1,
                                             self.history_length,
                                             self.feature_num
                                         ],
                                         name="ob_")

        # weight vector of current step
        self.addi_inputs = tf.placeholder(dtype=tf.float32,
                                          shape=[None, self.asset_num],
                                          name='addi_inputs')

        # weight vector of next step
        self.addi_inputs_ = tf.placeholder(dtype=tf.float32,
                                           shape=[None, self.asset_num],
                                           name='addi_inputs_')

        # the actions chosen by the DQN agent
        self.a = tf.placeholder(dtype=tf.int32, shape=[
            None,
        ], name='a')
        self.input_num = tf.placeholder(dtype=tf.int32, shape=[])

        # importance-sampling weight of each transition sampled from the memory pool
        self.ISWeights = tf.placeholder(tf.float32, [None, 1],
                                        name='IS_weights')

        # Q-values of the estimate net
        with tf.variable_scope('estm_net'):
            self.fc_input, self.q_pred = self.build_graph(
                self.price_his, self.addi_inputs, self.cnn_trainable)

        # Q-values of target net
        with tf.variable_scope('target_net'):
            _, self.tar_pred = self.build_graph(self.price_his_,
                                                self.addi_inputs_,
                                                self.cnn_trainable)

        # placeholder for the target Q-values
        with tf.variable_scope('q_tar'):
            self.q_target = tf.placeholder(dtype=tf.float32,
                                           shape=[None],
                                           name='q_target')

        # gather the estimated Q-values of the chosen actions
        with tf.variable_scope('q_estm_wa'):
            a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a],
                axis=1)
            self.q_estm_wa = tf.gather_nd(params=self.q_pred,
                                          indices=a_indices)

        # loss function
        with tf.name_scope('loss'):
            error = tf.abs(self.q_target - self.q_estm_wa)
            self.abs_errors = error
            square = tf.square(error)
            self.loss = tf.reduce_mean(self.ISWeights * square)

        # update the parameters of estimate Q-net
        with tf.name_scope('train'):
            self.optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=self.global_step)

    # network topology
    def build_graph(self, price_his, addi_input, trainable):
        kernels = self.network_config['kernels']
        strides = self.network_config['strides']
        filters = self.network_config['filters']
        fc1_size = self.network_config['fc1_size']

        # choose the activation function
        def set_activation(activation):
            if activation == 'relu':
                activation = tf.nn.relu
            elif activation == 'selu':
                activation = tf.nn.selu
            else:
                activation = tf.nn.leaky_relu
            return activation

        cnn_activation = set_activation(self.network_config['cnn_activation'])
        w_initializer = tf.random_uniform_initializer(-0.05, 0.05)
        b_initializer = tf.constant_initializer(
            self.network_config['b_initializer'])
        regularizer = layers.l2_regularizer(self.network_config['regularizer'])

        conv = price_his

        # first cnn layer
        conv = tf.layers.conv2d(conv,
                                filters=filters[0],
                                kernel_size=kernels[0],
                                strides=strides[0],
                                trainable=trainable,
                                activation=cnn_activation,
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer,
                                kernel_initializer=w_initializer,
                                bias_initializer=b_initializer,
                                padding='same',
                                name=self.name + 'conv' + str(0))

        # second cnn layer
        conv = tf.layers.conv2d(conv,
                                filters=filters[1],
                                kernel_size=kernels[1],
                                strides=strides[1],
                                trainable=trainable,
                                activation=cnn_activation,
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer,
                                kernel_initializer=w_initializer,
                                bias_initializer=b_initializer,
                                padding='same',
                                name=self.name + 'conv' + str(1))

        # weight vector with the weight of cash removed
        addi_input1 = addi_input[:, 1:]

        # insert weight vector into the feature maps
        conv = tf.concat([conv, addi_input1[:, :, np.newaxis, np.newaxis]],
                         axis=3)

        # third cnn layer
        conv = tf.layers.conv2d(conv,
                                filters=filters[2],
                                kernel_size=kernels[2],
                                strides=strides[2],
                                trainable=trainable,
                                activation=cnn_activation,
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer,
                                kernel_initializer=w_initializer,
                                bias_initializer=b_initializer,
                                padding='same',
                                name=self.name + 'conv' + str(2))

        cash_bias = tf.ones((self.input_num, 1))

        conv = tf.layers.flatten(conv)

        fc_input = tf.concat([cash_bias, conv], 1)

        fc1 = layers.fully_connected(fc_input,
                                     num_outputs=fc1_size,
                                     activation_fn=None,
                                     weights_initializer=w_initializer,
                                     trainable=True,
                                     scope=self.name + 'fc1')

        output_state = layers.fully_connected(
            fc1,
            num_outputs=1,
            activation_fn=None,
            weights_initializer=w_initializer,
            trainable=True,
            scope=self.name + 'output_state')

        output_action = layers.fully_connected(
            fc1,
            num_outputs=self.action_num,
            activation_fn=None,
            weights_initializer=w_initializer,
            trainable=True,
            scope=self.name + 'output_action')

        output = output_state + (output_action - tf.reduce_mean(
            output_action, axis=1, keep_dims=True))

        return fc_input, output

    def replay(self):

        obs, action_batch, reward_batch, obs_, tree_idx, ISWeights = self.memory.sample(
        )

        q_values_next = self.sess.run(self.q_pred,
                                      feed_dict={
                                          self.price_his: obs_['history'],
                                          self.addi_inputs: obs_['weights'],
                                          self.input_num:
                                          obs_['history'].shape[0]
                                      })

        best_actions = np.argmax(q_values_next, axis=1)

        q_values_next_target = self.sess.run(self.tar_pred,
                                             feed_dict={
                                                 self.price_his_:
                                                 obs_['history'],
                                                 self.addi_inputs_:
                                                 obs_['weights'],
                                                 self.input_num:
                                                 obs_['history'].shape[0]
                                             })

        targets_batch = reward_batch + self.gamma * q_values_next_target[
            np.arange(len(action_batch)), best_actions]

        fd = {
            self.q_target: targets_batch,
            self.price_his: obs['history'],
            self.addi_inputs: obs['weights'],
            self.a: action_batch,
            self.input_num: obs['history'].shape[0],
            self.ISWeights: ISWeights
        }

        _, abs_errors, global_step = self.sess.run(
            [self.train_op, self.abs_errors, self.global_step], feed_dict=fd)

        self.memory.batch_update(tree_idx, abs_errors)

        if global_step % self.update_tar_period == 0:
            self.sess.run(self.update_target)

        if self.save and global_step % self.save_period == 0:
            self.saver.save(self.sess,
                            abspath + 'logs/checkpoint/' + self.name,
                            global_step=global_step)

        if self.epsilon > self.epsilon_min:
            self.epsilon -= (1 - self.epsilon_min) / self.epsilon_decay_period

    def choose_action(self, observation, test):
        def action_max():
            fc_input, action_values = self.sess.run(
                [self.fc_input, self.q_pred],
                feed_dict={
                    self.price_his:
                    observation['history'][np.newaxis, :, :, :],
                    self.addi_inputs: observation['weights'][np.newaxis, :],
                    self.input_num: 1
                })
            return np.argmax(action_values), fc_input

        if not test:
            if np.random.rand() > self.epsilon:
                action_idx, fc_input = action_max()
            else:
                action_idx = np.random.randint(0, self.action_num)
                action_idx_, fc_input = action_max()
        else:
            action_idx, fc_input = action_max()

        action_weights = self.actions[action_idx]

        return action_idx, action_weights, fc_input

    def store(self, ob, a, r, ob_):
        self.memory.store(ob, a, r, ob_)

    def get_training_step(self):
        a = self.sess.run(self.global_step)
        return a

    def restore(self, name):
        self.saver.restore(self.sess, abspath + 'logs/checkpoint/' + name)

    def start_replay(self):
        return self.memory.start_replay()

    def memory_cnt(self):
        return self.memory.tree.data_pointer
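
# For reference: the tail of build_graph implements the dueling aggregation
# Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). A standalone numpy sketch of that
# identity on hypothetical values (not part of the example's TensorFlow graph):
import numpy as np

V = np.array([[0.5], [1.0]])                          # state values, shape (batch, 1)
A = np.array([[0.2, -0.1, 0.3], [0.0, 0.4, -0.4]])    # advantages, shape (batch, actions)
Q = V + (A - A.mean(axis=1, keepdims=True))           # mean-subtraction keeps V and A identifiable
print(Q)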