def build_dqn(self, params):
    with tf.variable_scope(PREDICTION):
        # Inputs to the prediction (online) Q-network: a window of
        # historical prices and the remaining-trade fraction.
        self.s_t = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.replay_memory.history_length,
                   self.replay_memory.num_channels],
            name=HISTORICAL_PRICES
        )
        self.trade_rem_t = tf.placeholder(
            dtype=tf.float32,
            shape=[None],
            name=TRADE_REM
        )

        with tf.variable_scope(DROPOUT_KEEP_PROBS):
            self.q_conv_keep_prob = tf.placeholder(tf.float32)
            self.q_dense_keep_prob = tf.placeholder(tf.float32)
            self.q_gru_keep_prob = tf.placeholder(tf.float32)

        params.dropoutkeepprobs = DropoutKeepProbs(
            self.q_conv_keep_prob,
            self.q_dense_keep_prob,
            self.q_gru_keep_prob
        )
        self.q = DeepSense(params, self.logger, self.sess, self.config,
                           name=Q_NETWORK)
        self.q.build_model((self.s_t, self.trade_rem_t))

    with tf.variable_scope(TARGET):
        # The target network mirrors the prediction network, but is
        # built with default DropoutKeepProbs (no dropout placeholders).
        self.t_s_t = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.replay_memory.history_length,
                   self.replay_memory.num_channels],
            name=HISTORICAL_PRICES
        )
        self.t_trade_rem_t = tf.placeholder(
            dtype=tf.float32,
            shape=[None],
            name=TRADE_REM
        )

        params.dropoutkeepprobs = DropoutKeepProbs()
        self.t_q = DeepSense(params, self.logger, self.sess, self.config,
                             name=T_Q_NETWORK)
        self.t_q.build_model((self.t_s_t, self.t_trade_rem_t))

    with tf.variable_scope(UPDATE_TARGET_NETWORK):
        # Placeholder/assign pairs used to copy the prediction network's
        # weights into the target network.
        self.q_weights_placeholders = {}
        self.t_weights_assign_ops = {}

        for name in self.q.weights.keys():
            self.q_weights_placeholders[name] = tf.placeholder(
                tf.float32,
                self.q.weights[name].get_shape().as_list()
            )
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                self.q_weights_placeholders[name]
            )

    with tf.variable_scope(TRAINING):
        self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
        self.action = tf.placeholder(tf.int64, [None], name=ACTION)

        # Select the Q-value of the action actually taken.
        action_one_hot = tf.one_hot(self.action, self.config[NUM_ACTIONS],
                                    1.0, 0.0, name=ACTION_ONE_HOT)
        q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                axis=1, name=Q_ACTED)

        with tf.variable_scope(LOSS):
            self.delta = self.target_q - q_acted
            self.global_step = tf.Variable(0, trainable=False)
            # Clipped (Huber-style) error keeps gradients bounded.
            self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

        with tf.variable_scope(OPTIMIZER):
            self.learning_rate_step = tf.placeholder(
                tf.int64, None, name=LEARNING_RATE_STEP)
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(
                    self.learning_rate,
                    self.learning_rate_step,
                    self.learning_rate_decay_step,
                    self.learning_rate_decay,
                    staircase=True
                )
            )
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learning_rate_op, momentum=0.95,
                epsilon=0.01).minimize(self.loss)

    with tf.variable_scope(SUMMARY):
        scalar_summary_tags = [
            'average.reward', 'average.loss', 'average.q',
            'episode.max reward', 'episode.min reward',
            'episode.avg reward', 'episode.num of episodes',
            'training.learning_rate'
        ]

        self.summary_placeholders = {}
        self.summary_ops = {}

        for tag in scalar_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder(
                tf.float32, None, name=tag.replace(' ', '_'))
            self.summary_ops[tag] = tf.summary.scalar(
                name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                tensor=self.summary_placeholders[tag]
            )

        histogram_summary_tags = ['episode.rewards', 'episode.actions']
        for tag in histogram_summary_tags:
            self.summary_placeholders[tag] = tf.placeholder(
                tf.float32, None, name=tag)
            self.summary_ops[tag] = tf.summary.histogram(
                tag, self.summary_placeholders[tag])

    self.sess.run(tf.local_variables_initializer())
    self.sess.run(tf.global_variables_initializer())

    # list(...) is required under Python 3, where dict.values() returns
    # a view that cannot be concatenated with a list.
    self._saver = tf.train.Saver(
        list(self.q.weights.values()) + [self.step_op], max_to_keep=30)

    self.load_model()
    self.update_target_network()

    self._summary_writer = tf.summary.FileWriter(
        self.config[TENSORBOARD_LOG_DIR])
    self._summary_writer.add_graph(self.sess.graph)
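The UPDATE_TARGET_NETWORK scope above only wires up the copy machinery; the `update_target_network` method invoked at the end of `build_dqn` is defined elsewhere in the class. A minimal sketch of what it presumably does, given the placeholder/assign pairs built above, is:

    def update_target_network(self):
        # Assumed implementation: copy each prediction-network weight
        # into the matching target-network variable by feeding its
        # current value through the placeholder of the assign op.
        for name in self.q.weights.keys():
            self.sess.run(self.t_weights_assign_ops[name], feed_dict={
                self.q_weights_placeholders[name]:
                    self.sess.run(self.q.weights[name])
            })

Running the assign ops through placeholders, rather than assigning variable-to-variable, keeps the two networks' graphs independent at the cost of one extra session round-trip per weight.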
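The summary placeholders and ops are likewise consumed outside `build_dqn`. A hedged sketch of the logging path, with `inject_summary` as an assumed helper name not shown in the source, could look like:

    def inject_summary(self, tag_dict, step):
        # Assumed helper: evaluate the summary ops for the given tags,
        # feeding scalar (or histogram) values through their
        # placeholders, then write the serialized summaries to the
        # TensorBoard event file.
        summaries = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict],
            feed_dict={self.summary_placeholders[tag]: value
                       for tag, value in tag_dict.items()}
        )
        for summary in summaries:
            self._summary_writer.add_summary(summary, step)

For example, `self.inject_summary({'average.loss': avg_loss, 'average.q': avg_q}, self.step)` would emit both scalars against the current training step.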