Code example #1
    # constructor of ExplorerFramework (the full class is shown in Code example #2)
    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name
Code example #2
import numpy as np
# Framework, Account, GAMMA and MAX_EPISODE_LENGTH are assumed to come from
# the project's own modules (e.g. the agent and emulator packages).


class ExplorerFramework(object):
    def __init__(self, access, name, observation, action_size):
        self.Access = access
        self.AC = Framework(self.Access, observation, action_size, name)
        self.env = Account()
        self.name = name

    def get_bootstrap(self, done, sess, next_state):
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_value(
                sess, np.expand_dims(next_state, axis=0))[0][0]
        return terminal

    def get_output(self, sess, inputs, actions, targets):
        return self.AC.get_losses(sess, inputs, actions, targets)

    def run(self, sess, max_episodes, t_max=32):
        episode = 0
        while episode < max_episodes:
            episode += 1
            _ = self.run_episode(sess, t_max)

    def run_episode(self, sess, t_max=32):
        t_start = t = 0
        episode_score = 0
        buffer_state = []
        buffer_action = []
        buffer_reward = []

        self.AC.init_network(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            reward, next_state, done = self.env.step(action)
            # accumulate the transition in the rollout buffers
            episode_score += reward
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state

            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(done, sess, next_state)

                buffer_target = []
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()

                inputs = np.stack(buffer_state, axis=0)
                actions = np.squeeze(np.vstack(buffer_action), axis=1)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)
                buffer_state = []
                buffer_action = []
                buffer_reward = []
                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets)

                # update local network
                self.AC.init_network(sess)

            if done or t > MAX_EPISODE_LENGTH:
                if self.name == 'W0':
                    outputs = tuple(self.get_output(sess, inputs, actions, targets))
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, actor_norm: %f, '
                          'critic: %f, critic_grad: %f, value: %f, critic_norm: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score
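
The reversed loop over buffer_reward in run_episode turns each rollout into n-step discounted targets: a state's target is its reward plus GAMMA times the next target, seeded with the bootstrap value from the critic (or 0 at a terminal state). A minimal standalone sketch of that construction, with illustrative values for GAMMA, the rewards and the bootstrap:

import numpy as np

GAMMA = 0.9          # discount factor (illustrative value)
bootstrap = 0.5      # critic value of the state after the rollout, 0 if terminal
buffer_reward = [1.0, 0.0, 2.0]

# same construction as in run_episode: walk the rewards backwards and
# fold the bootstrap value into a discounted return
terminal = bootstrap
buffer_target = []
for r in buffer_reward[::-1]:
    terminal = r + GAMMA * terminal
    buffer_target.append(terminal)
buffer_target.reverse()

# the first target equals r_0 + GAMMA*r_1 + GAMMA^2*r_2 + GAMMA^3*bootstrap
expected = sum(GAMMA ** k * r for k, r in enumerate(buffer_reward)) \
           + GAMMA ** len(buffer_reward) * bootstrap
print(np.allclose(buffer_target[0], expected))  # True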
Code example #3
File: main.py  Project: zzwlstarby/DeepLearningNotes
    # constructor of Agent (the full class is shown in Code example #5)
    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name
Code example #4
import numpy as np
from emulator.main import Account
from agent.agent import Agent

env = Account()
state = env.reset()
print(state.shape)
agent = Agent([5, 50, 58], 3)

# state = np.transpose(state, [2, 0, 1])
# state = np.expand_dims(state, 0)
# action = agent.get_epsilon_policy(state)
# reward, next_state, done = env.step(action)
# print(reward)

for i in range(1440):
    state = np.transpose(state, [2, 0, 1])
    state = np.expand_dims(state, 0)
    action = agent.get_epsilon_policy(state)
    reward, state, done = env.step(action)
    print(done, reward)
    if done:
        state = env.reset()
        break
Code example #5
File: main.py  Project: zzwlstarby/DeepLearningNotes
import numpy as np
# Framework, Account, batch_stack, GAMMA and MAX_EPISODE_LENGTH are assumed to
# come from the project's own modules.


class Agent(object):
    def __init__(self, name, access, batch_size, state_size, action_size):
        self.Access = access
        self.AC = Framework(name, self.Access, batch_size, state_size, action_size)
        self.env = Account()
        self.name = name

    def run(self, sess, max_episodes, t_max=8):
        buffer_score = []
        buffer_loss = []
        episode = 0
        while episode < max_episodes:
            episode += 1
            episode_score, outputs = self.run_episode(sess, t_max)
            buffer_score.append(episode_score)
            buffer_loss.append(outputs)
        return buffer_score, buffer_loss

    def run_episode(self, sess, t_max=8):
        t_start = t = 0
        episode_score = 1
        buffer_state = []
        buffer_action = []
        buffer_reward = []

        self.AC.init_or_update_local(sess)
        state = self.env.reset()
        while True:
            t += 1
            action = self.AC.get_stochastic_action(sess, state)
            next_state, reward, done = self.env.step(action)

            # accumulate the transition in the rollout buffers
            episode_score *= (1 + reward / 100)
            buffer_state.append(state)
            buffer_action.append(action)
            buffer_reward.append(reward)
            state = next_state

            if t - t_start == t_max or done:
                t_start = t
                terminal = self.get_bootstrap(sess, next_state, done)

                buffer_target = []
                for r in buffer_reward[::-1]:
                    terminal = r + GAMMA * terminal
                    buffer_target.append(terminal)
                buffer_target.reverse()

                # stack
                inputs, gather_list = batch_stack(buffer_state)
                actions = np.vstack(buffer_action)
                targets = np.squeeze(np.vstack(buffer_target), axis=1)

                # empty buffer
                buffer_state = []
                buffer_action = []
                buffer_reward = []

                # update Access gradients
                self.AC.train_step(sess, inputs, actions, targets, gather_list)

                # update local network
                self.AC.init_or_update_local(sess)

            if done or t > MAX_EPISODE_LENGTH:
                outputs = self.get_losses(sess, inputs, actions, targets, gather_list)
                outputs = tuple(outputs)
                if self.name == 'W0':
                    print('actor: %f, actor_grad: %f, policy mean: %f, policy: %f, entropy: %f, '
                          'critic: %f, critic_grad: %f, value: %f, value_mean: %f, advantage: %f'
                          % outputs)
                return episode_score, outputs

    def get_bootstrap(self, sess, next_state, done):
        if done:
            terminal = 0
        else:
            terminal = self.AC.get_step_value(sess, next_state)
        return terminal

    def get_losses(self, sess, inputs, actions, targets, gather_list):
        return self.AC.get_losses(sess, inputs, actions, targets, gather_list)
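
Unlike Code example #2, which sums rewards into episode_score, this version starts episode_score at 1 and multiplies it by (1 + reward / 100) each step, so the score is a compounded percentage return over the episode rather than a plain sum. A minimal sketch with hypothetical per-step rewards:

# hypothetical per-step rewards, interpreted as percentages
rewards = [0.5, -0.2, 1.0]

episode_score = 1
for reward in rewards:
    episode_score *= (1 + reward / 100)

print(episode_score)  # ~1.0130, i.e. about +1.3% over the episode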
Code example #6
import numpy as np
from emulator.main import Account

A = Account()
state = A.reset()
for i in range(1440):
    action = np.random.randint(0, 3)
    reward, next_state, done = A.step(action)
    print(reward)
Code example #7
import numpy as np
from emulator.main import Account

A = Account()
A.reset()
for i in range(200):
    order = np.random.randint(0, 3, 50)
    A.step(order)
    print(A.quote.total_value)
Code example #8
# sess.run(tf.global_variables_initializer())
#
#
# while True:
#     action = agent.get_stochastic_policy(sess, state)
#     next_state, reward, done = env.step(action)
#     agent.update_cache(state, action, reward, next_state, done)
#     state = next_state
#     if done:
#         break
#
# agent.update_value_net(sess)
# agent.update_target_net(sess)


# Account and Agent are assumed to be importable from the project's modules,
# e.g. from emulator.main import Account; note that the Agent used here takes
# no constructor arguments, unlike the one in Code example #5.
env = Account()
state = env.reset()

agent = Agent()
while True:
    action = agent.get_stochastic_policy(state)
    next_state, reward, done = env.step(action)
    agent.update_cache(state, action, reward, next_state, done)
    state = next_state
    if done:
        break

agent.update_target()
agent.update_eval()
agent.save_model()
agent.restore_model()
Code example #9
import numpy as np
from emulator.main import Account


A = Account()
A.reset()
for i in range(1000):
    action = np.random.randint(0, 3, 1)
    A.step(action)