Example #1
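# A distributed-PPO style worker (it appears to follow the common threaded DPPO
# pattern): each Worker owns a tgym SpreadTrading environment fed by a CSV data
# generator, collects rollouts with the shared GLOBAL_PPO policy, and pushes
# normalized, discounted minibatches into QUEUE for the global update thread.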
class Worker(object):
    def __init__(self, wid):
        trading_fee = .007
        time_fee = .0073
        history_length = 1
        #self.env = gym.make(GAME).unwrapped
        generator = get_CSV_data(filename="./test_6.csv")
        self.env = SpreadTrading(spread_coefficients=[1],
                                 data_generator=generator,
                                 trading_fee=trading_fee,
                                 time_fee=time_fee,
                                 history_length=history_length)
        self.wid = wid
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            #print("=======")
            #print(s)
            #print("========")
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer, use new policy to collect data
                a = self.ppo.choose_action(s)
                #print("=========")
                #print("a: ", a)
                #print("=========")
                s_, r, done, _ = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)  # normalize reward, found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count towards the minimum batch size; no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs = np.vstack(buffer_s)
                    ba = np.vstack(buffer_a)
                    br = np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
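Example #2

# Interactive demo of a tgym SpreadTrading environment driven by a deterministic
# WavySignal generator: the user types buy/sell/hold at the console and the
# choice is passed to the environment as a one-hot action before re-rendering.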
from tgym.core import DataGenerator
from tgym.envs import SpreadTrading
from tgym.gens.deterministic import WavySignal

generator = WavySignal(period_1=25, period_2=50, epsilon=-0.5)

game_length = 200
trading_fee = 0.2
time_fee = 0
# history_length: number of historical states in the observation vector.
history_length = 2

environment = SpreadTrading(spread_coefficients=[1],
                            data_generator=generator,
                            trading_fee=trading_fee,
                            time_fee=time_fee,
                            history_length=history_length,
                            game_length=game_length)

environment.render()
while True:
    action = input("Action: Buy (b) / Sell (s) / Hold (enter): ")
    if action == 'b':
        action = [0, 1, 0]
    elif action == 's':
        action = [0, 0, 1]
    else:
        action = [1, 0, 0]
    environment.step(action)
    environment.render()
Example #3
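# Main-thread side of the distributed PPO setup from Example #1: worker threads
# and a GLOBAL_PPO update thread are started and joined via a coordinator, the
# moving episode reward is plotted, and the trained policy is then replayed in a
# freshly built SpreadTrading environment.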
    for worker in workers:  # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()  # training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update))
    threads[-1].start()
    COORD.join(threads)

    # plot reward change and test
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()
    #env = gym.make('Pendulum-v0')
    trading_fee = .007
    time_fee = .00724
    history_length = 1
    generator = get_CSV_data(filename="./test_6.csv")
    env = SpreadTrading(spread_coefficients=[1],
                        data_generator=generator,
                        trading_fee=trading_fee,
                        time_fee=time_fee,
                        history_length=history_length)
    while True:
        s = env.reset()
        for t in range(3455):
            env.render()
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
Example #4
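# Fragment of a DQN-style training loop: an epsilon-greedy helper (exploit_explore)
# picks an action, the episode is force-terminated once max_steps is reached, and
# the terminal transition is stored in the replay memory before the episode
# summary is printed (the non-terminal branch is cut off in this excerpt).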
            state = np.reshape(observation, [1, state_size])

            while current_step < max_steps:

                current_step += 1
                decay_step += 1

                action, explore_probability = exploit_explore(
                    session=sess,
                    model=model,
                    explore_start=explore_start,
                    explore_stop=explore_stop,
                    decay_rate=decay_rate,
                    decay_step=decay_step,
                    state=state,
                    actions=possible_actions)

                state, reward, done, info = environment.step(action)
                reward_sum.append(reward)

                if current_step >= max_steps:
                    done = True

                if done:
                    next_state = np.zeros((state_size,), dtype=int)
                    step = max_steps
                    total_reward = np.sum(reward_sum)
                    scores.append(total_reward)
                    memory.add((state, action, reward, next_state, done))

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
Example #5
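# Training a DQNAgent on a tgym environment, run only when no saved model file
# exists for the market: the replay memory is first filled with warm-up
# transitions, then the agent trains for a fixed number of episodes while the
# per-episode reward, epsilon and last reported loss are printed.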
if not os.path.isfile("./model." + market + ".h5"):
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     memory_size=memory_size,
                     episodes=episodes,
                     episode_length=episode_length,
                     train_interval=train_interval,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     batch_size=batch_size,
                     epsilon_min=epsilon_min)

    # Warming up the agent
    for _ in range(memory_size):
        action = agent.act(state)
        next_state, reward, done, _ = environment.step(action)
        agent.observe(state, action, reward, next_state, done, warming_up=True)
    # Training the agent
    for ep in range(episodes):
        state = environment.reset()
        rew = np.float64(0)
        for _ in range(episode_length):
            action = agent.act(state)
            next_state, reward, done, _ = environment.step(action)
            loss = agent.observe(state, action, reward, next_state, done)
            state = next_state
            rew += np.float64(reward)
        if loss:
            print("Ep:" + str(ep) + "| rew:" + str(round(rew, 2)) + "| eps:" +
                  str(round(agent.epsilon, 2)) + "| loss:" +
                  str(round(loss.history["loss"][0], 4)))
Example #6
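# Live-trading loop: a DQNAgent whose brain is replaced by a previously built
# model picks one-hot actions, and buy/sell decisions are forwarded to an
# exchange wrapper (api.place_market_order) using the current balances. The loop
# ends when the environment reports that its plot window was closed.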
learning_rate = 0.001

agent = DQNAgent(state_size=state_size,
                 action_size=action_size,
                 memory_size=memory_size,
                 episodes=episodes,
                 episode_length=episode_length,
                 train_interval=train_interval,
                 gamma=gamma,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 epsilon_min=epsilon_min)
agent.brain = model

done = False
actions = ['buy', 'hold', 'sell']

while not done:
    action = agent.act(state)
    state, _, done, info = environment.step(action)
    if 'status' in info and info['status'] == 'Closed plot':
        done = True
    balance_X = api.balance()[market[3:]]
    balance_BTC = api.balance()["BTC"]
    action_label = actions[action.index(1)]
    if action_label == "buy":
        api.place_market_order(action_label, balance_X, market + '/BTC')
    elif action_label == "sell":
        api.place_market_order(action_label, balance_BTC, market + '/BTC')
#    print(action, state, _, done, info)
Example #7
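# Appears to be a DDPG-style training loop: an actor chooses actions, transitions
# are stored in the replay memory M, and once the memory is full the exploration
# noise is decayed and minibatches are sliced into states, actions and rewards
# (the snippet is truncated before the actual critic/actor updates).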
    sess.run(tf.global_variables_initializer())

    for i in range(171):
        s = environment.reset()
        #s = OD.DGroup(s)
        ep_reward = 0
        #print("=============")
        #print("s: ", s)
        #print("=============")
        for j in range(3443):
            a = actor.choose_action(s)
            #print("=============")
            #print("s: ", s, " --- ", j)
            #print("=============")
            s_, r, done, _ = environment.step(a)

            #s_ = OD.DGroup(s_)
            #print("=============")
            #print("s_: ", s_, " ---- ", j)
            #print("=============")

            M.store_transition(s, a, r, s_)

            if M.pointer > MEMORY_CAPACITY:
                var = max([var * .9999, VAR_MIN])  # decay the action randomness
                b_M = M.sample(BATCH_SIZE)
                b_s = b_M[:, :state_size]
                b_a = b_M[:, state_size:state_size + action_size]
                b_r = b_M[:, -state_size - 1:-state_size]