class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO:
    1. add `play` function to run tests in the simulated environment
    '''

    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.sess.run(self.step_op)

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        trade_rem = self.env.new_random_episode(self.history, self.replay_memory)

        for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict((self.history.history, trade_rem))
            # 2. act
            screen, reward, terminal, trade_rem = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal, trade_rem)

            if terminal:
                self.env.new_random_episode(self.history, self.replay_memory)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # episodes: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
                    self.logger.info(message)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.sess.run(
                            fetches=self.step_assign_op,
                            feed_dict={self.step_input: self.step + 1}
                        )
                        self.save_model(self.step + 1)
                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary({
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of episodes': num_episodes,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.sess.run(
                                fetches=self.learning_rate_op,
                                feed_dict={self.learning_rate_step: self.step}
                            )
                        }, self.step)

                    num_episodes = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def predict(self, state, test_ep=None):
        s_t = state[0]
        trade_rem_t = state[1]

        # epsilon-greedy exploration with a linearly decaying epsilon
        ep = test_ep or (self.ep_end +
                         max(0., (self.ep_start - self.ep_end)
                             * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.config[NUM_ACTIONS])
        else:
            action = self.sess.run(
                fetches=self.q.action,
                feed_dict={
                    self.q.phase: 0,
                    self.s_t: [s_t],
                    self.trade_rem_t: [trade_rem_t],
                    self.q_conv_keep_prob: 1.0,
                    self.q_dense_keep_prob: 1.0,
                    self.q_gru_keep_prob: 1.0
                }
            )[0]
        return action

    def observe(self, screen, reward, action, terminal, trade_rem):
        # clip reward to the range [min_reward, max_reward]
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal, trade_rem)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            state_t, action, reward, state_t_plus_1, terminal = self.replay_memory.sample
            s_t, trade_rem_t = state_t[0], state_t[1]
            s_t_plus_1, trade_rem_t_plus_1 = state_t_plus_1[0], state_t_plus_1[1]

            q_t_plus_1 = self.sess.run(
                fetches=self.t_q.values,
                feed_dict={
                    self.t_q.phase: 0,
                    self.t_s_t: s_t_plus_1,
                    self.t_trade_rem_t: trade_rem_t_plus_1
                }
            )

            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run(
                [self.optimizer, self.q.values, self.loss, self.q.avg_q_summary],
                {
                    self.q.phase: 1,
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.trade_rem_t: trade_rem_t,
                    self.q_conv_keep_prob: self.config[CONV_KEEP_PROB],
                    self.q_dense_keep_prob: self.config[DENSE_KEEP_PROB],
                    self.q_gru_keep_prob: self.config[GRU_KEEP_PROB],
                    self.learning_rate_step: self.step
                }
            )

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None],
                name=TRADE_REM
            )

        with tf.variable_scope(DROPOUT_KEEP_PROBS):
            self.q_conv_keep_prob = tf.placeholder(tf.float32)
            self.q_dense_keep_prob = tf.placeholder(tf.float32)
            self.q_gru_keep_prob = tf.placeholder(tf.float32)

        params.dropoutkeepprobs = DropoutKeepProbs(
            self.q_conv_keep_prob,
            self.q_dense_keep_prob,
            self.q_gru_keep_prob
        )
        self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK)
        self.q.build_model((self.s_t, self.trade_rem_t))

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels],
                name=HISTORICAL_PRICES
            )
            self.t_trade_rem_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None],
                name=TRADE_REM
            )

        params.dropoutkeepprobs = DropoutKeepProbs()
        self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK)
        self.t_q.build_model((self.t_s_t, self.t_trade_rem_t))

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32,
                    self.q.weights[name].get_shape().as_list()
                )
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                    self.q_weights_placeholders[name]
                )

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action, self.config[NUM_ACTIONS],
                                        1.0, 0.0, name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1, name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted
                self.global_step = tf.Variable(0, trainable=False)
                self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(
                        self.learning_rate,
                        self.learning_rate_step,
                        self.learning_rate_decay_step,
                        self.learning_rate_decay,
                        staircase=True
                    )
                )
                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q',
                                   'episode.max reward', 'episode.min reward', 'episode.avg reward',
                                   'episode.num of episodes', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag)
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        tag,
                        self.summary_placeholders[tag]
                    )

        self.sess.run(tf.local_variables_initializer())
        self.sess.run(tf.global_variables_initializer())

        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op], max_to_keep=30)

        self.load_model()
        self.update_target_network()

        self._summary_writer = tf.summary.FileWriter(self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.sess.run(
                fetches=self.t_weights_assign_ops[name],
                feed_dict={self.q_weights_placeholders[name]: self.sess.run(
                    fetches=self.q.weights[name]
                )}
            )

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()],
            {self.summary_placeholders[tag]: value for tag, value in tag_dict.items()}
        )
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, step)
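
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): one possible way to
# wire the agent together and start training. The caller must supply the
# project's config dict and a simulated-exchange environment exposing
# new_random_episode()/act(); both are assumptions here, not defined in this
# file. TF1-style Session usage matches the code above.
def run_training(config, env):
    import logging
    logger = logging.getLogger(__name__)
    with tf.Session() as sess:
        agent = Agent(sess, logger, config, env)   # builds the DQN graph in build_dqn()
        agent.train()                              # runs the training loop defined above
# ---------------------------------------------------------------------------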
class UserFile:
    DATA_FILE = os.path.expanduser("~/.cache/.wfclidata")
    root_node_id = "0"

    # SETUP METHODS
    def __init__(self):
        self.nds = NodeStore()
        self.cursor_y = 0
        self._load_data()
        self._update = True
        self.history = History(seed=self.nds)

    # Decorator for functions that need to force an update to our tree
    def update_visible_after(func):
        def do_update(self, *args, **kwargs):
            result = func(self, *args, **kwargs)
            self._update = True
            return result
        return do_update

    def update_visible_now(self):
        self._update = True

    def set_cursor_to_node(self, node_id):
        self.update_visible_now()
        for i, v in enumerate(self.visible):
            if v[0].uuid == node_id:
                self.cursor_y = i

    def current_node(self, depth=False):
        node_pair = self.visible[self.cursor_y]
        if depth:
            return node_pair
        else:
            return node_pair[0]

    # FILE METHODS
    def data_from_file_object(self, fo):
        data = json.load(fo)
        for node_def in data:
            node = Node(node_def=node_def)
            self.nds.add_node(node)

    @classmethod
    def _write_data_file(cls, data_obj):
        os.makedirs(os.path.dirname(cls.DATA_FILE), exist_ok=True)
        with open(cls.DATA_FILE, "x+") as f:
            json.dump(data_obj, f, indent=2)

    @classmethod
    def _create_empty_data_file(cls):
        empty_data = [
            {"id": cls.root_node_id, "pa": None, "ch": ["1"]},
            {"pa": cls.root_node_id, "id": "1", "nm": "Write down your thoughts"},
        ]
        cls._write_data_file(empty_data)

    def _load_data(self):
        if not os.path.exists(self.DATA_FILE):
            self._create_empty_data_file()
        with open(self.DATA_FILE) as f:
            self.data_from_file_object(f)

    def save(self):
        with open(self.DATA_FILE, "w") as f:
            json.dump(self.nds.flat_format, f, indent=2)

    def commit(self):
        self.history.add(self.nds)

    # TREE TRAVERSAL
    @property
    def visible(self):
        if self._update:
            self._update = False
            return self.load_visible()
        else:
            return self._visible

    def load_visible(self):
        """Return a list of (node, depth) tuples for every visible node."""
        self._visible = []
        for node in self.nds.get_node(self.root_node_id).children:
            if node is not None:
                self._traverse_node(node, 0)
        return self._visible

    def _traverse_node(self, node, depth):
        current_node = self.nds.get_node(node)
        self._visible.append((current_node, depth))
        if not current_node.closed:
            for child in current_node.children:
                self._traverse_node(child, depth + 1)

    # NAVIGATION METHODS
    def nav_up(self):
        if self.cursor_y > 0:
            self.cursor_y -= 1

    def nav_down(self):
        if self.cursor_y < len(self.visible) - 1:
            self.cursor_y += 1

    def bottom(self):
        self.cursor_y = len(self.visible) - 1

    def top(self):
        self.cursor_y = 0

    # LINKING METHODS
    @update_visible_after
    def link_parent_child(self, parent, child, position=-1):
        self.nds.get_node(child).parent = parent
        if position >= 0:
            self.nds.get_node(parent).children.insert(position, child)
        else:
            self.nds.get_node(parent).children.append(child)

    @update_visible_after
    def unlink_relink(self, old_parent, child, new_parent, position):
        def unlink_parent_child(self, parent, child):
            assert child in self.nds
            assert parent in self.nds
            assert self.nds.get_node(child).parent == parent
            assert child in self.nds.get_node(parent).children
            self.nds.get_node(parent).children.remove(child)
            self.nds.get_node(child).parent = None

        unlink_parent_child(self, old_parent, child)
        self.link_parent_child(new_parent, child, position)

    # MANIPULATE NODES
    @update_visible_after
    def indent(self):
        current_node = self.current_node()
        parent_node = current_node.parent
        parents_child_list = self.nds.get_node(parent_node).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index == 0:
            raise ModelException("Indent of top child")
        else:
            new_parent = parents_child_list[current_node_index - 1]
            self.unlink_relink(parent_node, current_node.uuid, new_parent, -1)
            self.nds.get_node(new_parent).closed = False
            log.info("Nailed it")

    @update_visible_after
    def unindent(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        if parent_id == self.root_node_id:
            raise ModelException("top level, can't unindent")
        else:
            super_parent_node = self.nds.get_node(self.nds.get_node(parent_id).parent)
            pos_in_parent_list = super_parent_node.children.index(parent_id)
            self.unlink_relink(
                parent_id,
                current_node.uuid,
                super_parent_node.uuid,
                pos_in_parent_list + 1,
            )
            log.info("nailed it")

    @update_visible_after
    def open_above(self):
        current_node = self.current_node()
        parent_node = self.nds.get_node(current_node.parent)
        new_node = self.create_node(parent_node.uuid)
        pos_in_parent_list = parent_node.children.index(current_node.uuid)
        self.link_parent_child(
            parent_node.uuid,
            new_node.uuid,
            pos_in_parent_list,
        )

    @update_visible_after
    def open_below(self):
        current_node = self.current_node()
        if current_node.state == "open":
            # new node is the first child of the current node
            new_node = self.create_node(current_node.uuid)
            self.link_parent_child(
                current_node.uuid,
                new_node.uuid,
                0,
            )
        else:
            # new node is a sibling of the current node
            parent_node = self.nds.get_node(current_node.parent)
            new_node = self.create_node(parent_node.uuid)
            pos_in_parent_list = parent_node.children.index(current_node.uuid)
            self.link_parent_child(
                parent_node.uuid,
                new_node.uuid,
                pos_in_parent_list + 1,
            )

    @update_visible_after
    def delete_item(self, node_id=None):
        current_node = self.current_node() if node_id is None else self.nds.get_node(node_id)
        for child_id in current_node.children[:]:
            self.delete_item(node_id=child_id)
        parent_id = current_node.parent
        self.nds.get_node(parent_id).children.remove(current_node.uuid)
        del self.nds[current_node.uuid]
        if node_id is None:  # this is our top-level delete
            self.cursor_y = max(0, self.cursor_y - 1)
            if len(self.nds.get_node(self.root_node_id).children) == 0:
                new_node = self.create_node(
                    self.root_node_id,
                    nm="Ooops, you deleted the last item on the list",
                )
                self.link_parent_child(
                    self.root_node_id,
                    new_node.uuid,
                    0,
                )

    @update_visible_after
    def move_down(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        parents_child_list = self.nds.get_node(parent_id).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index < len(parents_child_list) - 1:
            # swap with the sibling after it
            parents_child_list[current_node_index] = parents_child_list[current_node_index + 1]
            parents_child_list[current_node_index + 1] = current_node.uuid
            self.set_cursor_to_node(current_node.uuid)

    @update_visible_after
    def move_up(self):
        current_node = self.current_node()
        parent_id = current_node.parent
        parents_child_list = self.nds.get_node(parent_id).children
        current_node_index = parents_child_list.index(current_node.uuid)
        if current_node_index > 0:
            # swap with the sibling before it
            parents_child_list[current_node_index] = parents_child_list[current_node_index - 1]
            parents_child_list[current_node_index - 1] = current_node.uuid
            self.set_cursor_to_node(current_node.uuid)

    @update_visible_after
    def complete(self):
        current_node = self.current_node()
        current_node.complete = not current_node.complete

    @update_visible_after
    def create_node(self, parent, **kwargs):
        node = Node(pa=parent, **kwargs)
        self.nds.add_node(node)
        return node

    @update_visible_after
    def collapse_node(self):
        self.visible[self.cursor_y][0].closed = True

    @update_visible_after
    def expand_node(self):
        self.visible[self.cursor_y][0].closed = False

    @update_visible_after
    def undo(self):
        ret = self.history.undo()
        if ret is not None:
            self.nds = ret

    @update_visible_after
    def redo(self):
        ret = self.history.redo()
        if ret is not None:
            self.nds = ret

    # EDIT TEXT
    @update_visible_after
    def add_char(self, char, cursor_x):
        current_node = self.current_node()
        name = current_node.name[0:cursor_x] + char + current_node.name[cursor_x:]
        current_node.name = name

    @update_visible_after
    def delete_char(self, num, cursor_x):
        current_node = self.current_node()
        if cursor_x > 0:
            name = current_node.name[0:cursor_x - num] + current_node.name[cursor_x:]
            current_node.name = name
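
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): exercises the outline
# model against a throwaway data file so the user's real ~/.cache/.wfclidata is
# untouched. It relies only on methods defined above; redirecting DATA_FILE to
# a temporary directory is an assumption made purely for illustration.
def _demo_userfile():
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        UserFile.DATA_FILE = os.path.join(tmp, "wfclidata")  # redirect before first load
        uf = UserFile()         # seeds the file with the default two-node outline
        uf.add_char("!", 0)     # edit the text of the node under the cursor
        uf.open_below()         # create a new node below / under it
        uf.commit()             # snapshot the tree for undo/redo
        uf.save()               # write the flat JSON back to DATA_FILE
# ---------------------------------------------------------------------------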
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO:
    1. play
    '''

    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config, logger)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    @property
    def summary_writer(self):
        return self._summary_writer

    def train(self):
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self.learn_start:
                if self.step % self.test_step == self.test_step - 1:
                    avg_reward = total_reward / self.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count

                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                    message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \
                        % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes)
                    print_and_log_message(message, self.logger)

                    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                        self.step_assign_op.eval({self.step_input: self.step + 1})
                        self.save_model(self.step + 1)
                        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                    if self.step > 180:
                        self.inject_summary({
                            'average.reward': avg_reward,
                            'average.loss': avg_loss,
                            'average.q': avg_q,
                            'episode.max reward': max_ep_reward,
                            'episode.min reward': min_ep_reward,
                            'episode.avg reward': avg_ep_reward,
                            'episode.num of game': num_episodes,
                            'episode.rewards': ep_rewards,
                            'episode.actions': actions,
                            'training.learning_rate': self.learning_rate_op.eval(
                                {self.learning_rate_step: self.step}),
                        }, self.step)

                    num_episodes = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end +
                         max(0., (self.ep_start - self.ep_end)
                             * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]
        return action

    def observe(self, screen, reward, action, terminal):
        # clip reward to the range [min_reward, max_reward]
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

            # bootstrap from the maximum target-network Q-value for the next state
            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)

            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss, avg_q_summary = self.sess.run(
                [self.optimizer, self.q.values, self.loss, self.q.avg_q_summary],
                {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.summary_writer.add_summary(avg_q_summary, self.step)
            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels])
        self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels])
        self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                    self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action, self.env.action_size,
                                        1.0, 0.0, name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1, name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted
                self.global_step = tf.Variable(0, trainable=False)
                self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))
                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        with tf.variable_scope(SUMMARY):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q',
                                   'episode.max reward', 'episode.min reward', 'episode.avg reward',
                                   'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.histogram(tag, self.summary_placeholders[tag])

        self._summary_writer = tf.summary.FileWriter(self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

        tf.initialize_all_variables().run()

        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op], max_to_keep=30)

        self.load_model()
        self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval(
                {self.q_weights_placeholders[name]: self.q.weights[name].eval()})

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()],
            {self.summary_placeholders[tag]: value for tag, value in tag_dict.items()})
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, step)
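
# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original file): the Bellman backup used
# in q_learning_mini_batch above, reproduced with toy NumPy arrays. Note that
# this version applies no explicit discount factor; the target is simply
# r + (1 - terminal) * max_a' Q_target(s', a').
def _bellman_target_example():
    reward = np.array([1.0, -0.5, 0.2])
    terminal = np.array([False, False, True])
    q_t_plus_1 = np.array([[0.1, 0.4],        # Q_target(s', .) for each transition
                           [0.0, 0.3],
                           [0.9, 0.2]])
    max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
    terminal = np.array(terminal) + 0.         # bools -> 0./1., as in the agent code
    target_q = reward + (1 - terminal) * max_q_t_plus_1
    return target_q                            # -> array([ 1.4, -0.2,  0.2])
# ---------------------------------------------------------------------------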
class Agent(BaseAgent):
    '''Deep Trading Agent based on Deep Q Learning'''
    '''TODO:
    1. add summary ops
    2. timing and logging
    3. model saving
    4. increment self.step
    '''

    def __init__(self, sess, logger, config, env):
        super(Agent, self).__init__(config)
        self.sess = sess
        self.logger = logger
        self.config = config
        params = DeepSenseParams(config)

        self.env = env
        self.history = History(logger, config)
        self.replay_memory = ReplayMemory(logger, config)

        with tf.variable_scope(STEPS):
            self.step_op = tf.Variable(0, trainable=False, name=STEP)
            self.step_input = tf.placeholder('int32', None, name=STEP_INPUT)
            self.step_assign_op = self.step_op.assign(self.step_input)

        self.build_dqn(params)

    def train(self):
        start_step = self.step_op.eval()

        num_episodes, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        self.env.new_random_episode(self.history)

        for self.step in range(start_step, self.max_step):
            if self.step == self.learn_start:
                num_episodes, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. predict
            action = self.predict(self.history.get())
            # 2. act
            screen, reward, terminal = self.env.act(action)
            # 3. observe
            self.observe(screen, reward, action, terminal)

            if terminal:
                self.env.new_random_episode(self.history)
                num_episodes += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

    def predict(self, s_t, test_ep=None):
        ep = test_ep or (self.ep_end +
                         max(0., (self.ep_start - self.ep_end)
                             * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t))

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            action = self.q.action.eval({self.s_t: [s_t]})[0]
        return action

    def observe(self, screen, reward, action, terminal):
        # clip reward to the range [min_reward, max_reward]
        reward = max(self.min_reward, min(self.max_reward, reward))

        self.history.add(screen)
        self.replay_memory.add(screen, reward, action, terminal)

        if self.step > self.learn_start:
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            if self.step % self.target_q_update_step == self.target_q_update_step - 1:
                self.update_target_network()

    def q_learning_mini_batch(self):
        if self.replay_memory.count >= self.replay_memory.history_length:
            s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample()

            # bootstrap from the maximum target-network Q-value for the next state
            q_t_plus_1 = self.t_q.values.eval({self.t_s_t: s_t_plus_1})
            max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)

            terminal = np.array(terminal) + 0.
            target_q = reward + (1 - terminal) * max_q_t_plus_1

            _, q_t, loss = self.sess.run(
                [self.optimizer, self.q.values, self.loss],
                {
                    self.target_q: target_q,
                    self.action: action,
                    self.s_t: s_t,
                    self.learning_rate_step: self.step,
                })

            self.total_loss += loss
            self.total_q += q_t.mean()
            self.update_count += 1

    def build_dqn(self, params):
        with tf.variable_scope(PREDICTION):
            self.s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels])
        self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK)
        self.q.build_model(self.s_t)

        with tf.variable_scope(TARGET):
            self.t_s_t = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels])
        self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK)
        self.t_q.build_model(self.t_s_t, train=False)

        with tf.variable_scope(UPDATE_TARGET_NETWORK):
            self.q_weights_placeholders = {}
            self.t_weights_assign_ops = {}

            for name in self.q.weights.keys():
                self.q_weights_placeholders[name] = tf.placeholder(
                    tf.float32, self.q.weights[name].get_shape().as_list())
            for name in self.q.weights.keys():
                self.t_weights_assign_ops[name] = self.t_q.weights[name].assign(
                    self.q_weights_placeholders[name])

        with tf.variable_scope(TRAINING):
            self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q)
            self.action = tf.placeholder(tf.int64, [None], name=ACTION)

            action_one_hot = tf.one_hot(self.action, self.env.action_size,
                                        1.0, 0.0, name=ACTION_ONE_HOT)
            q_acted = tf.reduce_sum(self.q.values * action_one_hot,
                                    reduction_indices=1, name=Q_ACTED)

            with tf.variable_scope(LOSS):
                self.delta = self.target_q - q_acted
                self.global_step = tf.Variable(0, trainable=False)
                self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS)

            with tf.variable_scope(OPTIMIZER):
                self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP)
                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))
                self.optimizer = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

        # tf.initialize_all_variables().run()
        # initialize the q network and the target network with the same weights
        # self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval(
                {self.q_weights_placeholders[name]: self.q.weights[name].eval()})
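
# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original file): the linear epsilon-decay
# schedule used in predict() above, written as a standalone function so the
# exploration behaviour can be inspected in isolation. Parameter names mirror
# the agent's attributes (ep_start, ep_end, ep_end_t, learn_start); the default
# values here are placeholders, not the project's actual config.
def epsilon_at(step, ep_start=1.0, ep_end=0.1, ep_end_t=100000, learn_start=1000):
    return ep_end + max(0., (ep_start - ep_end)
                        * (ep_end_t - max(0., step - learn_start)) / ep_end_t)

# Epsilon stays at ep_start until learn_start, then decays linearly to ep_end
# by step learn_start + ep_end_t, e.g.:
#   epsilon_at(0) == 1.0, epsilon_at(51000) == 0.55, epsilon_at(200000) == 0.1
# ---------------------------------------------------------------------------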