def __init__(self, access, name, observation, action_size):
    """Bind the shared Access object and build this worker's resources.

    Args:
        access: shared parameter store / coordinator.
        name: worker identifier (also scopes the Framework graph).
        observation: observation spec forwarded to Framework.
        action_size: number of discrete actions.
    """
    self.Access = access
    self.name = name
    # Local actor-critic network tied to the shared Access parameters.
    self.AC = Framework(self.Access, observation, action_size, name)
    # Each worker interacts with its own private environment instance.
    self.env = Account()
class ExplorerFramework(object):
    """A3C explorer worker.

    Owns a local actor-critic (`Framework`) synced against the shared
    `Access` parameters and a private `Account` environment; plays
    episodes and pushes n-step gradients back to Access.
    """

    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name

    def get_bootstrap(self, done, sess, next_state):
        """Bootstrap value for the n-step return.

        Returns 0 on terminal states, otherwise the critic's value
        estimate of ``next_state`` (batched with a leading axis).
        """
        if done:
            return 0
        return self.AC.get_value(
            sess, np.expand_dims(next_state, axis=0))[0][0]

    def get_output(self, sess, inputs, actions, targets):
        """Proxy to the local network's diagnostic loss values."""
        return self.AC.get_losses(sess, inputs, actions, targets)

    def run(self, sess, max_episodes, t_max=32):
        """Play ``max_episodes`` episodes, discarding their scores."""
        for _ in range(max_episodes):
            self.run_episode(sess, t_max)

    def run_episode(self, sess, t_max=32):
        """Play one episode, training every ``t_max`` steps (n-step A3C).

        Returns:
            The accumulated (summed) episode reward.
        """
        t_start = t = 0
        episode_score = 0
        states, acts, rewards = [], [], []
        self.AC.init_network(sess)  # sync local weights from Access
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            reward, next_state, done = self.env.step(action)
            episode_score += reward
            states.append(state)
            acts.append(action)
            rewards.append(reward)
            state = next_state
            if t - t_start == t_max or done:
                t_start = t
                # Build discounted n-step targets, bootstrapping from the
                # critic's value of the last observed state.
                running = self.get_bootstrap(done, sess, next_state)
                target_list = []
                for r in reversed(rewards):
                    running = r + GAMMA * running
                    target_list.append(running)
                target_list.reverse()
                inputs = np.stack(states, axis=0)
                actions = np.squeeze(np.vstack(acts), axis=1)
                targets = np.squeeze(np.vstack(target_list), axis=1)
                states, acts, rewards = [], [], []
                # Push gradients to Access, then pull fresh weights.
                self.AC.train_step(sess, inputs, actions, targets)
                self.AC.init_network(sess)
                if done or t > MAX_EPISODE_LENGTH:
                    # Only the first worker logs diagnostics.
                    if self.name == 'W0':
                        outputs = tuple(
                            self.get_output(sess, inputs, actions, targets))
                        print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, actor_norm: %f, '
                              'critic: %f, critic_grad: %f, value: %f, critic_norm: %f, value_mean: %f, advantage: %f'
                              % outputs)
                    return episode_score
def __init__(self, name, access, batch_size, state_size, action_size):
    """Bind the shared Access object and build this worker's resources.

    Args:
        name: worker identifier (also scopes the Framework graph).
        access: shared parameter store / coordinator.
        batch_size: batch size forwarded to Framework.
        state_size: state spec forwarded to Framework.
        action_size: number of discrete actions.
    """
    self.Access = access
    self.name = name
    # Local actor-critic network tied to the shared Access parameters.
    self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
    # Each worker interacts with its own private environment instance.
    self.env = Account()
import numpy as np

from emulator.main import Account
from agent.agent import Agent

env = Account()
state = env.reset()
print(state.shape)
agent = Agent([5, 50, 58], 3)

# Drive one trading day (1440 minutes) through the epsilon-greedy policy.
for _ in range(1440):
    # Agent expects channel-first input with a leading batch axis.
    state = np.transpose(state, [2, 0, 1])
    state = np.expand_dims(state, 0)
    action = agent.get_epsilon_policy(state)
    reward, state, done = env.step(action)
    print(done, reward)
    if done:
        state = env.reset()
        break
class Agent(object):
    """A3C trading worker.

    Wraps a local actor-critic (`Framework`) synced against the shared
    `Access` parameters plus a private `Account` environment; compounds
    per-step percentage rewards into a multiplicative episode score.
    """

    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name

    def run(self, sess, max_episodes, t_max=8):
        """Play ``max_episodes`` episodes.

        Returns:
            (scores, losses): per-episode score and diagnostic-loss lists.
        """
        scores, losses = [], []
        for _ in range(max_episodes):
            score, outputs = self.run_episode(sess, t_max)
            scores.append(score)
            losses.append(outputs)
        return scores, losses

    def run_episode(self, sess, t_max=8):
        """Play one episode, training every ``t_max`` steps (n-step A3C).

        Returns:
            (episode_score, outputs): the compounded return and the final
            tuple of diagnostic loss values.
        """
        t_start = t = 0
        episode_score = 1  # multiplicative return, starts at 1
        states, acts, rewards = [], [], []
        self.AC.init_or_update_local(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            next_state, reward, done = self.env.step(action)
            # Rewards are percentages; compound them into the score.
            episode_score *= (1 + reward / 100)
            states.append(state)
            acts.append(action)
            rewards.append(reward)
            state = next_state
            if t - t_start == t_max or done:
                t_start = t
                # Build discounted n-step targets, bootstrapping from the
                # critic's value of the last observed state.
                running = self.get_bootstrap(sess, next_state, done)
                target_list = []
                for r in reversed(rewards):
                    running = r + GAMMA * running
                    target_list.append(running)
                target_list.reverse()
                inputs, gather_list = batch_stack(states)
                actions = np.vstack(acts)
                targets = np.squeeze(np.vstack(target_list), axis=1)
                states, acts, rewards = [], [], []
                # Push gradients to Access, then pull fresh weights.
                self.AC.train_step(sess, inputs, actions, targets, gather_list)
                self.AC.init_or_update_local(sess)
                if done or t > MAX_EPISODE_LENGTH:
                    outputs = tuple(self.get_losses(
                        sess, inputs, actions, targets, gather_list))
                    # Only the first worker logs diagnostics.
                    if self.name == 'W0':
                        print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, '
                              'critic: %f, critic_grad: %f, value: %f, value_mean: %f, advantage: %f'
                              % outputs)
                    return episode_score, outputs

    def get_bootstrap(self, sess, next_state, done):
        """Bootstrap value for the n-step return: 0 on terminal states,
        otherwise the critic's value estimate of ``next_state``."""
        if done:
            return 0
        return self.AC.get_step_value(sess, next_state)

    def get_losses(self, sess, inputs, actions, targets, gather_list):
        """Proxy to the local network's diagnostic loss values."""
        return self.AC.get_losses(sess, inputs, actions, targets, gather_list)
import numpy as np

from emulator.main import Account

A = Account()
state = A.reset()
# Drive one trading day (1440 minutes) with uniformly random actions.
for _ in range(1440):
    action = np.random.randint(0, 3)
    reward, next_state, done = A.step(action)
    print(reward)
import numpy as np

from emulator.main import Account

A = Account()
A.reset()
# 200 steps of random orders over 50 instruments, printing portfolio value.
for _ in range(200):
    order = np.random.randint(0, 3, 50)
    A.step(order)
    print(A.quote.total_value)
# NOTE(review): an earlier TF-session variant of this loop (explicit `sess`,
# global-variable init, update_value_net/update_target_net) was removed;
# the session-free agent API below replaces it.

env = Account()
agent = Agent()
state = env.reset()
done = False
while not done:
    action = agent.get_stochastic_policy(state)
    next_state, reward, done = env.step(action)
    # Store the transition for experience replay before advancing.
    agent.update_cache(state, action, reward, next_state, done)
    state = next_state
agent.update_target()
agent.update_eval()
agent.save_model()
agent.restore_model()
import numpy as np

from emulator.main import Account

A = Account()
A.reset()
# 1000 random steps; step receives a length-1 integer array as the action.
for _ in range(1000):
    action = np.random.randint(0, 3, 1)
    A.step(action)