class Worker(object):
    def __init__(self, wid):
        trading_fee = .007
        time_fee = .0073
        history_length = 1
        generator = get_CSV_data(filename="./test_6.csv")
        self.env = SpreadTrading(spread_coefficients=[1],
                                 data_generator=generator,
                                 trading_fee=trading_fee,
                                 time_fee=time_fee,
                                 history_length=history_length)
        self.wid = wid
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():                 # while global PPO is updating
                    ROLLING_EVENT.wait()                       # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer, use new policy to collect data
                a = self.ppo.choose_action(s)
                s_, r, done, _ = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)                   # normalize reward, found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1                     # count towards minimum batch size, no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []                          # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))         # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()                  # stop collecting data
                        UPDATE_EVENT.set()                     # global PPO update
                    if GLOBAL_EP >= EP_MAX:                    # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                  '|W%i' % self.wid,
                  '|Ep_r: %.2f' % ep_r)
from tgym.core import DataGenerator
from tgym.envs import SpreadTrading
from tgym.gens.deterministic import WavySignal

generator = WavySignal(period_1=25, period_2=50, epsilon=-0.5)

game_length = 200
trading_fee = 0.2
time_fee = 0
# history_length: number of historical states in the observation vector.
history_length = 2

environment = SpreadTrading(spread_coefficients=[1],
                            data_generator=generator,
                            trading_fee=trading_fee,
                            time_fee=time_fee,
                            history_length=history_length,
                            game_length=game_length)

environment.render()
while True:
    # use raw_input() instead of input() if you are still on Python 2
    action = input("Action: Buy (b) / Sell (s) / Hold (enter): ")
    if action == 'b':
        action = [0, 1, 0]
    elif action == 's':
        action = [0, 0, 1]
    else:
        action = [1, 0, 0]
    environment.step(action)
    environment.render()
for worker in workers:          # worker threads
    t = threading.Thread(target=worker.work, args=())
    t.start()                   # training
    threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update,))
threads[-1].start()
COORD.join(threads)

# plot reward changes and test
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('Episode')
plt.ylabel('Moving reward')
plt.ion()
plt.show()

trading_fee = .007
time_fee = .00724
history_length = 1
generator = get_CSV_data(filename="./test_6.csv")
env = SpreadTrading(spread_coefficients=[1],
                    data_generator=generator,
                    trading_fee=trading_fee,
                    time_fee=time_fee,
                    history_length=history_length)

while True:
    s = env.reset()
    for t in range(3455):
        env.render()
        s = env.step(GLOBAL_PPO.choose_action(s))[0]
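get_CSV_data, used by the worker class and by the evaluation loop above, is not defined in the excerpt. Trading-Gym already ships a CSV-backed generator (CSVStreamer) for this purpose; a minimal stand-in, assuming a DataGenerator subclass only needs to override the static _generator method and yield one row of prices per step, could look like the sketch below. The class name and the header flag are illustrative, not part of the original code.

import csv

import numpy as np

from tgym.core import DataGenerator


class CSVDataGenerator(DataGenerator):
    """Hypothetical stand-in for get_CSV_data: streams prices from a CSV file."""

    @staticmethod
    def _generator(filename, header=False):
        with open(filename, "r") as csvfile:
            reader = csv.reader(csvfile)
            if header:
                next(reader, None)      # skip the header row if there is one
            for row in reader:
                yield np.array(row, dtype=float)


# usage, mirroring the scripts above:
# generator = CSVDataGenerator(filename="./test_6.csv")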
state = np.reshape(observation, [1, state_size])
while current_step < max_steps:
    current_step += 1
    decay_step += 1
    action, explore_probability = exploit_explore(session=sess,
                                                  model=model,
                                                  explore_start=explore_start,
                                                  explore_stop=explore_stop,
                                                  decay_rate=decay_rate,
                                                  decay_step=decay_step,
                                                  state=state,
                                                  actions=possible_actions)
    state, reward, done, info = environment.step(action)
    reward_sum.append(reward)

    if current_step >= max_steps:
        done = True

    if done:
        next_state = np.zeros((state_size,), dtype=int)
        step = max_steps
        total_reward = np.sum(reward_sum)
        scores.append(total_reward)
        memory.add((state, action, reward, next_state, done))
        print('Episode: {}'.format(episode),
              'Total reward: {}'.format(total_reward))
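exploit_explore is called above but not defined in the excerpt. A minimal sketch follows, assuming it performs epsilon-greedy selection with an exploration probability that decays exponentially from explore_start towards explore_stop; the model.inputs_ and model.output tensor handles are assumptions for illustration, not part of the original code.

import numpy as np


def exploit_explore(session, model, explore_start, explore_stop,
                    decay_rate, decay_step, state, actions):
    """Epsilon-greedy action selection with exponential epsilon decay (sketch)."""
    explore_probability = explore_stop + \
        (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if np.random.rand() < explore_probability:
        # explore: pick a random one-hot action
        action = actions[np.random.randint(len(actions))]
    else:
        # exploit: pick the action with the highest predicted Q-value
        # (model.inputs_ / model.output are assumed tensor handles)
        q_values = session.run(model.output, feed_dict={model.inputs_: state})
        action = actions[int(np.argmax(q_values))]

    return action, explore_probability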
if not os.path.isfile("./model." + market + ".h5"):
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     memory_size=memory_size,
                     episodes=episodes,
                     episode_length=episode_length,
                     train_interval=train_interval,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     batch_size=batch_size,
                     epsilon_min=epsilon_min)

    # Warming up the agent
    for _ in range(memory_size):
        action = agent.act(state)
        next_state, reward, done, _ = environment.step(action)
        agent.observe(state, action, reward, next_state, done, warming_up=True)

    # Training the agent
    for ep in range(episodes):
        state = environment.reset()
        rew = np.float64(0)
        for _ in range(episode_length):
            action = agent.act(state)
            next_state, reward, done, _ = environment.step(action)
            loss = agent.observe(state, action, reward, next_state, done)
            state = next_state
            rew += np.float64(reward)
        if loss:
            print("Ep:" + str(ep)
                  + "| rew:" + str(round(rew, 2))
                  + "| eps:" + str(round(agent.epsilon, 2))
                  + "| loss:" + str(round(loss.history["loss"][0], 4)))
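The os.path.isfile() guard implies the trained network is saved once this block finishes, so that later runs (and the live-trading script below, which assigns agent.brain = model) can reuse it. A minimal sketch, assuming agent.brain is the underlying Keras model:

from keras.models import load_model

# Persist the trained network so the isfile() check above skips retraining
# on the next run (assumes agent.brain is a Keras model exposing save()).
agent.brain.save("./model." + market + ".h5")

# Later runs can reload it for the live-trading loop below:
model = load_model("./model." + market + ".h5")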
learning_rate = 0.001

agent = DQNAgent(state_size=state_size,
                 action_size=action_size,
                 memory_size=memory_size,
                 episodes=episodes,
                 episode_length=episode_length,
                 train_interval=train_interval,
                 gamma=gamma,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 epsilon_min=epsilon_min)
agent.brain = model

done = False
# label order matches SpreadTrading's one-hot actions: [1, 0, 0] hold, [0, 1, 0] buy, [0, 0, 1] sell
actions = ['hold', 'buy', 'sell']
while not done:
    action = agent.act(state)
    state, _, done, info = environment.step(action)
    if 'status' in info and info['status'] == 'Closed plot':
        done = True

    balance_X = api.balance()[market[3:]]
    balance_BTC = api.balance()["BTC"]

    action_label = actions[action.index(1)]
    if action_label == "buy":
        api.place_market_order(action_label, balance_X, market + '/BTC')
    elif action_label == "sell":
        api.place_market_order(action_label, balance_BTC, market + '/BTC')
    # print(action, state, _, done, info)
sess.run(tf.global_variables_initializer())

for i in range(171):
    s = environment.reset()
    ep_reward = 0
    for j in range(3443):
        a = actor.choose_action(s)
        s_, r, done, _ = environment.step(a)
        M.store_transition(s, a, r, s_)

        if M.pointer > MEMORY_CAPACITY:
            var = max([var * .9999, VAR_MIN])   # decay the action randomness
            b_M = M.sample(BATCH_SIZE)
            b_s = b_M[:, :state_size]
            b_a = b_M[:, state_size:state_size + action_size]
            b_r = b_M[:, -state_size - 1:-state_size]
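The listing cuts off while slicing the replay batch. For reference, here is a sketch of how such a batch is typically consumed in this DDPG-style layout, assuming replay rows are stored as [s, a, r, s_] (as the slices above imply) and that the critic and actor objects expose learn() methods; the calls and the trailing bookkeeping are assumptions, not part of the original listing.

            # next-state columns occupy the tail of each replay row [s, a, r, s_]
            b_s_ = b_M[:, -state_size:]

            critic.learn(b_s, b_a, b_r, b_s_)   # one TD gradient step on the critic
            actor.learn(b_s)                    # policy-gradient step on the actor

        s = s_                                  # advance to the next state
        ep_reward += r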