class PolicyGradient:
    def __init__(self, env, env_test, discount=0.99, model_filename=None, history_filename=None):
        self.env = env
        self.env_test = env_test
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')
        self.test_avg_reward_sum = 0
        self.avg_reward_sum = 0

    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()
        for t in reversed(range(0, r.size)):
            if r[t] != 0:
                running_add = 0
            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def paper(self, code):
        def take_position():
            date2position = {}
            game_over = False
            observation = self.env_test._reset(code)
            while not game_over:
                aprob = self.model.predict(observation)[0]
                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env_test.action_space.n, 1, p=aprob / np.sum(aprob))[0]
                else:
                    action = 0 if np.random.uniform() < aprob else 1
                observation, reward, game_over, info = self.env_test.step(action)
                date2position[info['dt']] = action
            return date2position

        def cum_return(sym, row):
            date = row['date']
            close_rel = row['close_rel']
            if date in date2position:
                position = date2position[date]
                if 1 == position:
                    result['cum'] *= close_rel
                    return result['cum']
                elif 0 == position:
                    result['cum'] *= 2 - close_rel
                    return result['cum']
            else:
                print(date, 'not in date2position')
                return result['cum']

        def cum_return_bh(sym, row):
            # buy and hold
            def bought_every_day(sym, current_day):
                return 1
            position = bought_every_day(sym, row['date'])
            if 1 == position:
                result['cum'] *= row['close_rel']
            elif 0 == position:
                pass
            elif -1 == position:
                pass  # no short
            return result['cum']

        def cum_return_sh(sym, row):
            # short and hold
            def bought_every_day(sym, current_day):
                return -1
            position = bought_every_day(sym, row['date'])
            if 1 == position:
                result['cum'] *= row['close_rel']
            elif 0 == position:
                pass
            elif -1 == position:
                result['cum'] *= 2 - row['close_rel']
            return result['cum']

        df = pd.read_csv(os.path.join(local_path, 'data', '%s.csv' % code))
        start = self.env_test.startDate
        end = self.env_test.endDate
        df = df[(df.date >= start) & (df.date < end)]
        dates = pd.to_datetime(df.date, format='%Y-%m-%d')
        df['close_rel'] = (df.close / df.close.shift(1)).fillna(1.0)

        date2position = take_position()

        import matplotlib.pyplot as plt
        result = {'cum': 1}
        df_cum = df.apply(lambda x: cum_return('^DJI', x), axis=1)
        plt.plot(dates, df_cum)

        result = {'cum': 1}  # reset the cumulative return before each benchmark curve
        df_cum = df.apply(lambda x: cum_return_bh('^DJI', x), axis=1)
        plt.plot(dates, df_cum)

        result = {'cum': 1}
        df_cum = df.apply(lambda x: cum_return_sh('^DJI', x), axis=1)
        plt.plot(dates, df_cum)
        plt.show()

    def test(self, e, code, verbose=False):
        env_test = self.env_test
        model = self.model

        env_test._reset(code)
        observation = env_test._reset(code)
        game_over = False
        reward_sum = 0

        while not game_over:
            aprob = model.predict(observation)[0]
            if aprob.shape[0] > 1:
                action = np.random.choice(env_test.action_space.n, 1, p=aprob / np.sum(aprob))[0]
            else:
                action = 0 if np.random.uniform() < aprob else 1
            observation, reward, game_over, info = env_test.step(action)
            reward_sum += float(reward)

            if verbose > 0:
                if env_test.actions[action] == "LONG" or env_test.actions[action] == "SHORT":
                    color = bcolors.FAIL if env_test.actions[action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env_test.actions[action] + bcolors.ENDC,
                                                     reward_sum, info["cum"]) +
                          ("\t".join(["%s:%.2f" % (l, i)
                                      for l, i in zip(env_test.actions, aprob.tolist())])))

        self.test_avg_reward_sum = self.test_avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
            e, info["code"],
            (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC,
            info["cum"], self.test_avg_reward_sum)
        return toPrint

    def train(self, max_episode=1000000, max_path_length=200, verbose=True):
        env = self.env
        model = self.model

        for e in range(max_episode):
            from random import random
            code = self.env.targetCodes[int(random() * len(self.env.targetCodes))]
            env._reset(code)
            observation = env._reset(code)
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env.action_space.n, 1, p=aprob / np.sum(aprob))[0]
                    y = np.zeros([self.env.action_space.n])
                    y[action] = 1.
                    outputs.append(y)
                else:
                    action = 0 if np.random.uniform() < aprob else 1
                    y = [float(action)]
                    outputs.append(y)

                observation, reward, game_over, info = self.env.step(action)
                reward_sum += float(reward)
                rewards.append(float(reward))

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                        color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                         reward_sum, info["cum"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))

            self.avg_reward_sum = self.avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                e, info["code"],
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC,
                info["cum"], self.avg_reward_sum)
            print(toPrint, '\t', self.test(e, code))
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in range(dim)]

            outputs_ = np.vstack(outputs)
            predicteds_ = np.vstack(predicteds)
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], end=' ')

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward, discounted_reward)

            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)
            model.save_weights(self.model_filename)

class PolicyGradient:
    def __init__(self, env, discount=0.99, model_filename=None, history_filename=None):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        self.model = MarketPolicyGradientModelBuilder(model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, r):
        discounted_r = zeros_like(r)
        running_add = 0
        r = r.flatten()
        for t in reversed(range(0, r.size)):
            if r[t] != 0:
                running_add = 0
            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = random.choice(self.env.action_space.n, 1, p=aprob / sum(aprob))[0]
                    y = zeros([self.env.action_space.n])
                    y[action] = 1.
                    outputs.append(y)
                else:
                    action = 0 if random.uniform() < aprob else 1
                    y = [float(action)]
                    outputs.append(y)

                observation, reward, game_over, info = self.env._step(action)
                reward_sum += float(reward)
                rewards.append(float(reward))

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                        color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                         reward_sum, info["cum"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                e, info["code"],
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC,
                info["cum"], avg_reward_sum)
            print(toPrint)
            if self.history_filename != None:
                system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [array(inputs_[i]) for i in range(dim)]

            outputs_ = vstack(outputs)
            predicteds_ = vstack(predicteds)
            rewards_ = vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= mean(discounted_rewards_)
            discounted_rewards_ /= std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], end=' ')

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = minimum(1, maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward, discounted_reward)

            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)

            model_json = model.to_json()
            with open(join(BASE_DIR, "models", self.model_filename + ".json"), "w") as json_file:
                json_file.write(model_json)
            model.save_weights(join(BASE_DIR, "models", self.model_filename + ".h5"))

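
# The variant above persists the model architecture as JSON and the weights as HDF5.
# A minimal reload sketch, assuming the same BASE_DIR/models layout and the Keras API
# used by the code above (model_from_json / load_weights are standard Keras calls):
#
# from os.path import join
# from keras.models import model_from_json
#
# def load_policy_model(base_dir, name):
#     # rebuild the architecture from JSON, then restore the trained weights
#     with open(join(base_dir, "models", name + ".json")) as json_file:
#         model = model_from_json(json_file.read())
#     model.load_weights(join(base_dir, "models", name + ".h5"))
#     model.compile(loss='mse', optimizer='rmsprop')
#     return model
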
class PolicyGradient:
    def __init__(self, env, discount=0.99, model_filename=None, history_filename=None):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(model_filename).getModel()
        # self.model = MarketPolicyGradientModelBuilder().buildModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()
        for t in reversed(np.arange(0, r.size)):
            if r[t] != 0:
                running_add = 0
            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def train(self, max_episode=1e1, max_path_length=200, threshold=0.5, verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.

        for e in np.arange(max_episode):
            env.reset()
            observation = env.reset()
            game_over = False
            reward_sum = 0
            last_y = np.array([0, 1])

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            date_list = []
            value_list = []
            benchmark_list = []
            predict_summary = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    if max(aprob) > threshold:
                        action = np.argsort(aprob)[-1]
                        # action = np.random.choice(self.env.action_space.n, 1, p=aprob / np.sum(aprob))[0]
                        self.env.last_action = action
                        y = np.zeros([self.env.action_space.n])
                        y[action] = 1.
                        last_y = y.copy()
                        outputs.append(y)
                    else:
                        action = 2
                        outputs.append(last_y)
                else:
                    action = 0 if np.random.uniform() < aprob else 1
                    y = [float(action)]
                    outputs.append(y)

                predict_summary.append(max(aprob))
                observation, reward, game_over, info = self.env.step(action)
                reward_sum += float(reward)
                rewards.append(float(reward))

                date_list.append(info["dt"])
                value_list.append(info["rat"])
                benchmark_list.append(info["cum"])

                if verbose > 0:
                    if action == 2:
                        color = bcolors.OKBLUE if aprob[0] == max(aprob) else bcolors.FAIL
                        print("%s:\t%s\t%.2f\t%.2f\t%.2f\t" % (info["dt"], color + "HOLD!!!" + bcolors.ENDC,
                                                               reward_sum, info["cum"], info["rat"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))
                    elif env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                        color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                               reward_sum, info["cum"], info["rat"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            fc = bcolors.FAIL if info["cum"] >= 1 else bcolors.OKBLUE
            fr = bcolors.FAIL if info["rat"] >= 1 else bcolors.OKBLUE
            bw = bcolors.ENDC
            toPrint = "%d\t\t%s\t%.2f\t%s\t%s\t%.2f" % (
                e, info["code"], reward_sum,
                fc + ("%.2f" % info["cum"]) + bw,
                fr + ("%.2f" % info["rat"]) + bw,
                avg_reward_sum)
            # toPrint = "%d\t\t%s\t%s\t%.2f\t%.2f\t%.2f" % (e, info["code"], (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"], info["rat"], avg_reward_sum)
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))
            print(toPrint)

            M.plot_trade_summary(indices=date_list, value=value_list, benchmark=benchmark_list)
            plt.hist(predict_summary, bins=200)
            plt.show()

            dim = len(inputs[0])
            inputs_ = [[] for i in np.arange(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in np.arange(dim)]

            outputs_ = np.vstack(outputs)
            predicteds_ = np.vstack(predicteds)
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i])

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward, discounted_reward)

            model.fit(inputs_, outputs_, epochs=1, verbose=0, shuffle=True)
            model.save_weights('model_1.h5')

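
# A self-contained sketch (names are illustrative, not from the original code) of the
# confidence-threshold action rule this variant uses: act greedily only when the network's
# top probability clears `threshold`, otherwise emit the HOLD action (index 2) and reuse
# the previous one-hot label as the training target.
import numpy as np

HOLD_ACTION = 2

def select_action(aprob, last_y, threshold=0.5):
    if aprob.max() > threshold:
        action = int(np.argmax(aprob))        # confident: take the most probable action
        y = np.zeros_like(aprob)
        y[action] = 1.0
        return action, y
    return HOLD_ACTION, last_y                # not confident: hold and keep the last target

if __name__ == '__main__':
    print(select_action(np.array([0.55, 0.45]), last_y=np.array([0., 1.])))   # (0, [1., 0.])
    print(select_action(np.array([0.50, 0.50]), last_y=np.array([0., 1.])))   # (2, [0., 1.])
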
class PolicyGradient:
    def __init__(self, env, discount=0.99, model_filename=None, history_filename=None, max_memory=100):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename
        self.max_memory = max_memory

        # SGD is constructed but not actually used
        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        # rmsprop is used instead
        #self.model.compile(loss='mse', optimizer='rmsprop')
        self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # More detailed explanation: https://blog.csdn.net/heyc861221/article/details/80132054
    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()
        # Work backwards from the last step
        for t in reversed(range(0, r.size)):
            # TODO: reset running_add to 0?
            # In the Pong reference implementation a non-zero reward marks the end of a
            # game, so the running sum is reset at each game boundary:
            # if r[t] != 0:
            #     running_add = 0
            # Expanded, running_add starts at the current reward r[t], then
            # running_add = r[t] + discount * r[t+1] + discount^2 * r[t+2] + ...
            # i.e. the discounted return E(discount^n * reward) from the formula.
            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.

        #f_eps = open("episode.csv", "w")
        #write_eps = csv.write(f_eps)

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            #f_iter = open("episode_{0}.csv".format(e), "w")
            #write_iter = csv.writer(f_iter)
            f_episode = "episode_{0}.csv".format(e)
            os.system("rm -rf {0}".format(f_episode))

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env.action_space.n, 1, p=aprob / np.sum(aprob))[0]
                    y = np.zeros([self.env.action_space.n])
                    y[action] = 1.
                    outputs.append(y)
                else:
                    #action = 0 if np.random.uniform() < aprob else 1
                    # If aprob == 1.0, reduce it, because np.random.uniform() returns values in [0, 1).
                    m_aprob = 0.9 if aprob == 1.0 else aprob
                    action = 0 if np.random.uniform() < m_aprob else 1
                    y = [float(action)]
                    outputs.append(y)

                observation, reward, actual_reward, game_over, info = self.env._step(action)
                reward_sum += float(actual_reward)
                rewards.append(float(reward))

                # Cap the rollout memory for the RNN model
                if len(inputs) > self.max_memory:
                    del inputs[0]
                    del outputs[0]
                    del predicteds[0]
                    del rewards[0]

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                        #if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                        color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                         reward_sum, info["cum"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))
                        #write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                        os.system("echo %s >> %s" % ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action],
                                                                                reward_sum, info["cum"]) +
                                                     ("\t".join(["%s:%.2f" % (l, i)
                                                                 for l, i in zip(env.actions, aprob.tolist())])),
                                                     f_episode))

            #write_iter.close()
            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                e, info["code"],
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC,
                info["cum"], avg_reward_sum)
            print(toPrint)
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in range(dim)]

            outputs_ = np.vstack(outputs)
            predicteds_ = np.vstack(predicteds)
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            # TODO: skipping the mean shift should also work.
            # Shifting can turn the smallest negative values positive, i.e. flip their sign.
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            if np.std(discounted_rewards_) != 0.:
                discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i])

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                # Adjust the targets: when reward < 0 (a losing trade), flip all the outputs.
                #if discounted_reward < 0:
                #    outputs_[i] = 1 - outputs_[i]
                #    outputs_[i] = outputs_[i] / sum(outputs_[i])

                # Gradient of the softmax log-loss?
                # http://vsooda.github.io/2017/03/14/softmax-logistic/
                # For a softmax output layer the backward gradient is simply probability minus label.
                #outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))
                outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * discounted_reward))

                if verbose > 0:
                    print(predicteds_[i], outputs_[i], reward, discounted_reward)

            print("fit model input.shape %s, output.shape %s" %
                  ([inputs_[i].shape for i in range(len(inputs_))], outputs_.shape))

            np.set_printoptions(linewidth=200, suppress=True)
            print("currentTargetIndex:", env.currentTargetIndex)
            #print(inputs_)
            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)
            model.save_weights(self.model_filename)

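
# A small, self-contained check of the discounting formula described in the comments
# above (not part of the original code): with discount = 0.99 and episode rewards
# [0, 0, 1], the discounted returns are [0.9801, 0.99, 1.0], i.e.
# r[t] + 0.99 * r[t+1] + 0.99^2 * r[t+2].
import numpy as np

def discount_rewards_demo(r, discount=0.99):
    discounted_r = np.zeros_like(r, dtype=float)
    running_add = 0.0
    for t in reversed(range(r.size)):
        running_add = running_add * discount + r[t]   # accumulate from the end of the episode
        discounted_r[t] = running_add
    return discounted_r

if __name__ == '__main__':
    print(discount_rewards_demo(np.array([0., 0., 1.])))   # [0.9801 0.99 1.]
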
class PolicyGradient:
    def __init__(self, env, discount=0.99, model_filename=None, history_filename=None):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()
        for t in reversed(range(0, r.size)):
            if r[t] != 0:
                running_add = 0
            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def train(self, max_episode=1000, max_path_length=200, verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.
        target_close = env.get_close()

        for e in range(max_episode):
            env.reset()
            observation = env.reset()
            game_over = False
            reward_sum = 0
            cum_profit = {}
            pre_action = {}

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env.action_space.n, 1, p=aprob / np.sum(aprob))[0]
                    y = np.zeros([self.env.action_space.n])
                    y[action] = 1.
                    outputs.append(y)
                else:
                    action = 0 if np.random.uniform() < aprob else 1
                    y = [float(action)]
                    outputs.append(y)

                observation, reward, game_over, info = self.env.step(action)
                reward_sum += float(reward)
                cum_profit[info['dt']] = reward_sum
                rewards.append(float(reward))

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                        pre_action[info['dt']] = env.actions[action]
                        color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%d\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                             info['correct_action'], reward_sum, info["cum"]) +
                              ("\t".join(["%s:%.2f" % (l, i)
                                          for l, i in zip(env.actions, aprob.tolist())])))

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%.2f\t%.2f" % (
                e,
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC,
                info["cum"], avg_reward_sum)
            print(toPrint)
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in range(dim)]

            outputs_ = np.vstack(outputs)        # one-hot actions (0 or 1)
            predicteds_ = np.vstack(predicteds)  # predicted probabilities
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], end=' ')

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward, discounted_reward)

            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)

            if e % 5 == 0 and e != 0:
                test_util.plot_profit(cum_profit, target_close, pre_action, "pg_train_" + str(e))
                test_util.get_test_performance(e, 'model_pg.h5', model)

            model.save_weights("model_pg.h5" if self.model_filename == None else self.model_filename,
                               overwrite=True)

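
# A self-contained sketch (not from the original repo; the function name is illustrative)
# of the target-adjustment step that every variant above shares: when the discounted
# reward is negative, flip the one-hot action label and renormalize, then move the target
# away from the predicted probabilities in proportion to |discounted_reward|, clipped to
# [0, 1]. The adjusted targets are what model.fit() is trained against.
import numpy as np

def adjust_targets(one_hot, predicted, discounted_reward):
    target = one_hot.astype(float).copy()
    if discounted_reward < 0:                 # losing trade: push probability mass to the other actions
        target = 1 - target
        target = target / target.sum()
    # blend towards the (possibly flipped) label, scaled by the reward magnitude
    return np.minimum(1, np.maximum(0, predicted + (target - predicted) * abs(discounted_reward)))

if __name__ == '__main__':
    # a losing LONG with predicted probs [0.6, 0.4] is nudged towards SHORT: [0.3, 0.7]
    print(adjust_targets(np.array([1., 0.]), np.array([0.6, 0.4]), -0.5))
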