def main(stock_name, model_name):
    # if len(sys.argv) != 3:
    #     print("Usage: python evaluate.py [stock] [model]")
    #     exit()
    # stock_name, model_name = sys.argv[1], sys.argv[2]
    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32

    state = getState(data, 0, window_size + 1)
    total_profit = 0
    agent.inventory = []

    for t in range(l):
        action = agent.act(state)

        # sit
        next_state = getState(data, t + 1, window_size + 1)
        reward = 0

        if action == 1:  # buy
            agent.inventory.append(data[t])
            print("Buy: " + formatPrice(data[t]))

        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))

        done = True if t == l - 1 else False
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

        if done:
            print("--------------------------------")
            print(stock_name + " Total Profit: " + formatPrice(total_profit))
            print("--------------------------------")
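# A minimal command-line entry point, reconstructed from the commented-out
# argv handling inside main() above (a sketch; the original script may wire
# this up differently):
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        print("Usage: python evaluate.py [stock] [model]")
        exit()
    main(sys.argv[1], sys.argv[2])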
def Trainer(stock_name, window_size, episode_count):
    agent = Agent(window_size)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32

    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = getState(data, 0, window_size + 1)

        total_profit = 0
        agent.inventory = []

        for t in range(l):
            action = agent.act(state)

            # sit
            next_state = getState(data, t + 1, window_size + 1)
            reward = 0

            if action == 1:  # buy
                agent.inventory.append(data[t])
                print("Buy: " + formatPrice(data[t]))

            elif action == 2 and len(agent.inventory) > 0:  # sell
                bought_price = agent.inventory.pop(0)
                reward = max(data[t] - bought_price, 0)
                total_profit += data[t] - bought_price
                print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))

            done = True if t == l - 1 else False
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state

            if done:
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("--------------------------------")

            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)

        if e % 10 == 0:
            agent.model.save("models/model_ep" + str(e))
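# A hypothetical invocation of Trainer; the symbol and hyper-parameters are
# placeholders, mirroring the usage string "python train.py [stock] [window]
# [episodes]" quoted in the training script further below:
if __name__ == "__main__":
    Trainer(stock_name="GSPC", window_size=10, episode_count=1000)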
def eval_model(stock_name, model_name):
    # Agent
    window_size = get_window_size(model_name)
    agent = Agent(window_size, True, model_name)

    # Environment
    env = SimpleTradeEnv(stock_name, window_size, agent)

    # Main loop
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

    return env.total_profit
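# get_window_size is not defined in this file; a plausible sketch, reading the
# window from the saved Keras model exactly as the other evaluation scripts
# here do (an assumption, not the original helper):
def get_window_size(model_name):
    model = load_model("models/" + model_name)
    return model.layers[0].input.shape.as_list()[1]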
stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])

agent = Agent(window_size)
data = getStockDataVec(stock_name)
l = len(data) - 1

for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)

    total_profit = 0
    agent.inventory = []

    for t in range(l):
        action = agent.act(state)

        # sit
        next_state = getState(data, t + 1, window_size + 1)
        reward = 0

        if action == 1:  # buy
            agent.inventory.append(data[t])
            # print("Buy: " + formatPrice(data[t]))

        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            # print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))
def main():
    ## Simulator
    simulator_args = {}
    simulator_args['config'] = 'config/config.cfg'
    simulator_args['resolution'] = (widthIn, heightIn)
    simulator_args['frame_skip'] = 1
    simulator_args['color_mode'] = 'RGB24'
    simulator_args['game_args'] = "+name ICO +colorset 7"

    ## Agent
    agent_args = {}

    # preprocessing
    preprocess_input_images = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_images'] = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_measurements'] = lambda x: x / 100. - 0.5
    agent_args['num_future_steps'] = 6
    pred_scale_coeffs = np.expand_dims(
        (np.expand_dims(np.array([8., 40., 1.]), 1) *
         np.ones((1, agent_args['num_future_steps']))).flatten(), 0)
    agent_args['meas_for_net_init'] = range(3)
    agent_args['meas_for_manual_init'] = range(3, 16)
    agent_args['resolution'] = (width, height)
    # just use grayscale for nnet inputs
    agent_args['num_channels'] = 1

    # net parameters
    agent_args['net_type'] = "fc"
    # agent_args['net_type'] = "conv"
    agent_args['conv_params'] = np.array([(16, 5, 4), (32, 3, 2), (64, 3, 2), (128, 3, 2)],
                                         dtype=[('out_channels', int), ('kernel', int), ('stride', int)])
    agent_args['fc_img_params'] = np.array([(128,)], dtype=[('out_dims', int)])
    agent_args['fc_meas_params'] = np.array([(128,), (128,), (128,)], dtype=[('out_dims', int)])
    agent_args['fc_joint_params'] = np.array([(256,), (256,), (-1,)], dtype=[('out_dims', int)])
    agent_args['target_dim'] = agent_args['num_future_steps'] * len(agent_args['meas_for_net_init'])
    agent_args['n_actions'] = 7

    # experiment arguments
    agent_args['test_objective_params'] = (np.array([5, 11, 17]), np.array([1., 1., 1.]))
    agent_args['history_length'] = 3
    agent_args['history_length_ico'] = 3
    historyLen = agent_args['history_length']
    print("HistoryLen: ", historyLen)

    print('starting simulator')
    simulator = DoomSimulator(simulator_args)
    num_channels = simulator.num_channels
    print('started simulator')

    agent_args['state_imgs_shape'] = (historyLen * num_channels,
                                      simulator.resolution[1], simulator.resolution[0])
    agent_args['n_ffnet_input'] = agent_args['resolution'][0] * agent_args['resolution'][1]
    agent_args['n_ffnet_hidden'] = np.array([50, 5])
    agent_args['n_ffnet_output'] = 1
    agent_args['n_ffnet_act'] = 7
    agent_args['n_ffnet_meas'] = simulator.num_meas
    agent_args['learning_rate'] = 1E-4

    modelDir = os.path.join(os.path.expanduser("~"), "Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/Models")

    if 'meas_for_net_init' in agent_args:
        agent_args['meas_for_net'] = []
        for ns in range(historyLen):
            agent_args['meas_for_net'] += [i + simulator.num_meas * ns
                                           for i in agent_args['meas_for_net_init']]
        agent_args['meas_for_net'] = np.array(agent_args['meas_for_net'])
    else:
        agent_args['meas_for_net'] = np.arange(historyLen * simulator.num_meas)

    if len(agent_args['meas_for_manual_init']) > 0:
        # current timestep is the last in the stack
        agent_args['meas_for_manual'] = np.array([i + simulator.num_meas * (historyLen - 1)
                                                  for i in agent_args['meas_for_manual_init']])
    else:
        agent_args['meas_for_manual'] = []

    agent_args['state_meas_shape'] = (len(agent_args['meas_for_net']),)

    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
    # agent = Agent(sess, agent_args)
    # agent.load('/home/paul/Dev/GameAI/vizdoom_cig2017/icolearner/ICO1/checkpoints/ICO-8600')
    # print("model loaded..")
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))

    img_buffer = np.zeros((historyLen, simulator.resolution[1], simulator.resolution[0], num_channels),
                          dtype='uint8')
    meas_buffer = np.zeros((historyLen, simulator.num_meas))
    act_buffer = np.zeros((historyLen, 7))
    act_buffer_ico = np.zeros((agent_args['history_length_ico'], 7))
    curr_step = 0
    old_step = -1
    term = False

    print("state_meas_shape: ", meas_buffer.shape, " == ", agent_args['state_meas_shape'])
    print("act_buffer_shape: ", act_buffer.shape)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
    ag = Agent(sess, agent_args)

    if os.path.isfile("checkpoints/checkpoint"):
        ag.load('/home/paul/Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/checkpoints/')
        print("model loaded..")
    else:
        print("No model file, initialising...")

    diff_y = 0
    diff_x = 0
    diff_z = 0
    diff_theta = 0
    iter = 1
    epoch = 200
    radialFlowLeft = 30.
    radialFlowRight = 30.
    radialFlowInertia = 0.4
    radialGain = 4.
    rotationGain = 50.
    errorThresh = 10.
    updatePtsFreq = 50
    skipImage = 1
    skipImageICO = 5
    reflexGain = 1E-4
    flowGain = 0.
    netGain = 10.
    oldHealth = 0.

    # create masks for left and right visual fields - note that these only cover
    # the upper half of the image; this is to help prevent the tracking getting
    # confused by the floor pattern
    half_height = round(height / 2)
    half_width = round(width / 2)
    maskLeft = np.zeros([height, width], np.uint8)
    maskLeft[half_height:, :half_width] = 1.
    maskRight = np.zeros([height, width], np.uint8)
    maskRight[half_height:, half_width:] = 1.

    lk_params = dict(winSize=(15, 15), maxLevel=2,
                     criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
    feature_params = dict(maxCorners=500, qualityLevel=0.03, minDistance=7, blockSize=7)

    imgCentre = np.array([int(simulator_args['resolution'][0] / 2),
                          int(simulator_args['resolution'][1] / 2)])
    print("Image centre: ", imgCentre)

    rawInputs = np.zeros((height, width))
    cheatInputs = np.zeros((width, height))
    input_buff = np.zeros((1, width * height))
    target_buff = np.zeros((1, 1))
    meas_buff = np.zeros((1, simulator.num_meas))
    netOut = 0.
    netErr = np.zeros((width, height))
    delta = 0.
    shoot = 0
    reflexOn = False
    iter = 0

    while not term:
        if curr_step < historyLen:
            curr_act = np.zeros(7).tolist()
            img, meas, rwrd, term = simulator.step(curr_act)
            print("Image: ", img.shape, " max: ", np.amax(img), " min: ", np.amin(img))
            if curr_step == 0:
                p0Left = cv2.goodFeaturesToTrack(img[:, :, 0], mask=maskLeft, **feature_params)
                p0Right = cv2.goodFeaturesToTrack(img[:, :, 0], mask=maskRight, **feature_params)

            img_buffer[curr_step % historyLen] = img
            meas_buffer[curr_step % historyLen] = meas
            act_buffer[curr_step % historyLen] = curr_act[:7]

        else:
            img1 = img_buffer[(curr_step - 2) % historyLen, :, :, :]
            img2 = img_buffer[(curr_step - 1) % historyLen, :, :, :]
            state = simulator._game.get_state()
            stateImg = state.screen_buffer

            if curr_step % updatePtsFreq == 0:
                p0Left = cv2.goodFeaturesToTrack(img[:, :, 0], mask=maskLeft, **feature_params)
                p0Right = cv2.goodFeaturesToTrack(img[:, :, 0], mask=maskRight, **feature_params)

            p1Left, st, err = cv2.calcOpticalFlowPyrLK(img1[:, :, 0], img2[:, :, 0], p0Left, None, **lk_params)
            p1Right, st, err = cv2.calcOpticalFlowPyrLK(img1[:, :, 0], img2[:, :, 0], p0Right, None, **lk_params)
            flowLeft = (p1Left - p0Left)[:, 0, :]
            flowRight = (p1Right - p0Right)[:, 0, :]

            radialFlowTmpLeft = 0
            radialFlowTmpRight = 0
            for i in range(0, len(p0Left)):
                radialFlowTmpLeft += (p0Left[i, 0, :] - imgCentre).dot(flowLeft[i, :]) / float(len(p0Left))
            for i in range(0, len(p0Right)):
                radialFlowTmpRight += (p0Right[i, 0, :] - imgCentre).dot(flowRight[i, :]) / float(len(p0Right))

            rotation = act_buffer[(curr_step - 1) % historyLen][6]
            forward = act_buffer[(curr_step - 1) % historyLen][3]

            # keep separate radial errors for left and right fields
            radialFlowLeft = radialFlowLeft + radialFlowInertia * (radialFlowTmpLeft - radialFlowLeft)
            radialFlowRight = radialFlowRight + radialFlowInertia * (radialFlowTmpRight - radialFlowRight)
            expectFlowLeft = radialGain * forward + (rotationGain * rotation if rotation < 0. else 0.)
            expectFlowRight = radialGain * forward - (rotationGain * rotation if rotation > 0. else 0.)
            flowErrorLeft = forward * (expectFlowLeft - radialFlowLeft) / (1. + rotationGain * np.abs(rotation))
            flowErrorRight = forward * (expectFlowRight - radialFlowRight) / (1. + rotationGain * np.abs(rotation))
            flowErrorLeft = flowErrorLeft if flowErrorLeft > 0. else 0.
            flowErrorRight = flowErrorRight if flowErrorRight > 0. else 0.

            icoSteer = 0.

            if curr_step > 100:
                health = meas[1]

                if health < 0.1:
                    reflexOn = False
                    iter = 0

                # Don't run any networks when the player is dead!
                if 0. < health < 101.:
                    # steer signal: difference of the rectified left/right flow errors
                    icoInSteer = flowGain * max(flowErrorRight - errorThresh, 0.) \
                                 - flowGain * max(flowErrorLeft - errorThresh, 0.)

                    centre, bottomLeft, topRight, colourStrength = getMaxColourPos(stateImg, [255, 0, 0])
                    colourSteer = imgCentre[0]
                    cheatInputs = stateImg * 1.
                    if len(bottomLeft) > 0 and len(topRight) > 0 and \
                            (topRight[0] - bottomLeft[0]) < width / 3 and \
                            (topRight[1] - bottomLeft[1]) < height / 2:
                        colourSteer = bottomLeft[0] + int(0.5 * (topRight[0] - bottomLeft[0]))
                        # cv2.imwrite("/home/paul/tmp/Backup/rect-" + str(curr_step) + ".jpg", cheatInputs)

                    cv2.arrowedLine(cheatInputs, (colourSteer, imgCentre[1] + 10),
                                    (colourSteer, imgCentre[1]), color=(255, 255, 255), thickness=2)
                    rawInputs = np.array(np.sum(stateImg, axis=2) / 3)
                    cheatInputs = np.array(np.sum(cheatInputs, axis=2) / 3)
                    # cv2.imwrite("/home/paul/tmp/Backup/cheat-" + str(curr_step) + ".jpg", cheatInputs)
                    input_buff[0, :] = np.ndarray.flatten(cheatInputs)
                    input_buff = input_buff - np.mean(input_buff)
                    input_buff = input_buff / np.sqrt(np.var(input_buff))

                    # we want the reflex to be delayed w.r.t. the image input, so that
                    # the image arrives first. Otherwise the learning can never reduce
                    # the error to zero no matter how good the controller.
                    if iter > 2:
                        delta = (float(colourSteer) - float(imgCentre[0])) / float(width)
                    else:
                        delta = 0

                    if iter > 2:
                        if np.abs(delta) < 0.01:
                            shoot = 1

                    target_buff[...] = delta + netOut
                    # target_buff[...] = delta
                    # target_buff[...] = 0.2
                    meas_buff[0, :] = meas
                    ag.act(input_buff, meas, target_buff)

                    if ag.net_type == 'conv':
                        netOut = np.ndarray.flatten(ag.ext_covnet_output)[0].flatten()[0]
                    elif ag.net_type == 'fc':
                        netOut = np.ndarray.flatten(ag.ext_fcnet_output)[0].flatten()[0]
                    print(" *** ", delta, delta + netOut, netGain * netOut, ag.learning_rate)

                    diff_theta = 0.6 * max(min(icoInSteer, 5.), -5.)
                    netErr[:, :] = 0.
                    diff_theta = diff_theta + reflexGain * colourStrength * delta

                    curr_act = np.zeros(7).tolist()
                    curr_act[0] = 0
                    curr_act[1] = 0
                    curr_act[2] = 0  # shoot
                    curr_act[3] = curr_act[3] + diff_z
                    curr_act[4] = 0
                    curr_act[5] = 0.
                    curr_act[6] = diff_theta + netGain * netOut
                    iter += 1

                    if curr_step % epoch == 0:
                        ag.save('/home/paul/Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/checkpoints/BP', curr_step)

            img, meas, rwrd, term = simulator.step(curr_act)
            if meas is not None and meas[0] > 30.:
                meas[0] = 30.

            if not term:
                img_buffer[curr_step % historyLen] = img
                meas_buffer[curr_step % historyLen] = meas
                act_buffer[curr_step % historyLen] = curr_act[:7]

        curr_step += 1

    simulator.close_game()
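# getMaxColourPos is called above but not defined in this file. A minimal
# sketch of what it appears to do: locate pixels near a target colour and
# report their centre, bounding box corners, and a strength score. The real
# implementation may differ; np is assumed imported as above:
def getMaxColourPos(img, colour, threshold=60):
    diff = np.linalg.norm(img.astype(np.float32) - np.array(colour, np.float32), axis=2)
    ys, xs = np.nonzero(diff < threshold)
    if len(xs) == 0:
        # no match: empty corners and zero strength (callers test len(bottomLeft) > 0)
        return np.array([]), np.array([]), np.array([]), 0.
    bottomLeft = np.array([xs.min(), ys.min()])
    topRight = np.array([xs.max(), ys.max()])
    centre = np.array([int(xs.mean()), int(ys.mean())])
    colourStrength = float(len(xs))  # crude confidence: count of matching pixels
    return centre, bottomLeft, topRight, colourStrength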
stock_name, model_name = sys.argv[1], sys.argv[2]

model = load_model("models/" + model_name)
window_size = model.layers[0].input.shape.as_list()[1]

agent = Agent(window_size, True, model_name)
data = getStockDataVec(stock_name)
l = len(data) - 1
batch_size = 32

state = getState(data, 0, window_size + 1)
total_profit = 0
agent.inventory = []

for t in range(l):
    action = agent.act(state)

    # sit
    next_state = getState(data, t + 1, window_size + 1)
    reward = 0

    if action == 1:  # buy
        agent.inventory.append(data[t])
        print("Buy: " + formatPrice(data[t]))

    elif action == 2 and len(agent.inventory) > 0:  # sell
        bought_price = agent.inventory.pop(0)
        reward = max(data[t] - bought_price, 0)
        total_profit += data[t] - bought_price
        print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))
# Set up the environment
portfolio = Portfolio(start_cash, trade_size, price_iter)
env = Environment(pairs, state_iter, portfolio)
state_shape = env.state().shape
agent = Agent(state_shape[0], is_eval=True, model_location=model_location)

num_steps = 1
metrics = pd.DataFrame(columns=['tick', 'action', 'value'])

while True:
    tick = env.current_tick[0]
    cur_state = env.state()
    action = agent.act(cur_state, env.valid_actions())

    try:
        reward = env.execute(action)
        value = env.portfolio.valueAtTime(env.current_tick[0])
        metrics = metrics.append({'tick': tick, 'action': action, 'value': value},
                                 ignore_index=True)
        num_steps += 1
    except StopIteration:
        print("Evaluation ended after processing", num_steps - 1, "ticks")
        print(str(env))
        print(str(agent))
        break  # stop once the price iterator is exhausted
# Visualizing the learning
# _, ax = plt.subplots()

agent = Agent(state_shape[0], is_eval=True,
              actor_model_location=actor_model_location,
              critic_model_location=critic_model_location)

num_steps = 1
metrics = pd.DataFrame(columns=['tick', 'action', 'value'])

while True:
    tick = env.current_tick[0]
    cur_state = env.state()
    action = agent.act(cur_state)

    try:
        reward = env.execute(action)
        value = env.portfolio.valueAtTime(env.current_tick[0])
        metrics = metrics.append({'tick': tick, 'action': action, 'value': value},
                                 ignore_index=True)
        num_steps += 1
    except StopIteration:
        print("Training ended after processing", num_steps - 1, "ticks")
        print(str(env))
        print(str(agent))
        break  # stop once the price iterator is exhausted
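# The commented-out plt.subplots() above suggests the metrics frame was meant
# to be plotted; a minimal sketch of that visualization (assuming
# matplotlib.pyplot is imported as plt):
_, ax = plt.subplots()
ax.plot(metrics['tick'], metrics['value'])
ax.set_xlabel('tick')
ax.set_ylabel('portfolio value')
plt.show()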
hold_size = 0

for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    maxd = data[0]
    mind = data[0]
    history = []
    pp = np.mean(history) if history else float('nan')
    state = getState(data, 0, window_size, maxd, mind,
                     len(agent.inventory), agent.money / 1000000, pp)

    total_profit = 0
    agent.money = 1000000
    agent.inventory = []

    for t in range(l):
        history.append(data[t])
        action = agent.act(state, data[t])
        # print(action)

        if data[t] > maxd:
            maxd = data[t]
        elif data[t] < mind:
            mind = data[t]

        # sit
        pp = np.mean(history) if history else float('nan')
        next_state = getState(data, t + 1, window_size, maxd, mind,
                              len(agent.inventory), agent.money / 1000000, pp)
        agent.inventory.sort()
        reward = 0

        if action == 1:
            # print(np.mean(history))
            if len(agent.inventory) == 0:
                reward += 10
def main():
    ## Simulator
    simulator_args = {}
    simulator_args['config'] = 'config/config.cfg'
    simulator_args['resolution'] = (160, 120)
    simulator_args['frame_skip'] = 2
    simulator_args['color_mode'] = 'GRAY'
    simulator_args['game_args'] = "+name IntelAct +colorset 7"

    ## Agent
    agent_args = {}

    # preprocessing
    agent_args['preprocess_input_images'] = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_measurements'] = lambda x: x / 100. - 0.5
    agent_args['num_future_steps'] = 6
    pred_scale_coeffs = np.expand_dims(
        (np.expand_dims(np.array([8., 40., 1.]), 1) *
         np.ones((1, agent_args['num_future_steps']))).flatten(), 0)
    agent_args['postprocess_predictions'] = lambda x: x * pred_scale_coeffs
    agent_args['discrete_controls_manual'] = range(6, 12)
    agent_args['meas_for_net_init'] = range(3)
    agent_args['meas_for_manual_init'] = range(3, 16)
    agent_args['opposite_button_pairs'] = [(0, 1), (2, 3)]

    # net parameters
    agent_args['conv_params'] = np.array([(16, 5, 4), (32, 3, 2), (64, 3, 2), (128, 3, 2)],
                                         dtype=[('out_channels', int), ('kernel', int), ('stride', int)])
    agent_args['fc_img_params'] = np.array([(128,)], dtype=[('out_dims', int)])
    agent_args['fc_meas_params'] = np.array([(128,), (128,), (128,)], dtype=[('out_dims', int)])
    agent_args['fc_joint_params'] = np.array([(256,), (256,), (-1,)], dtype=[('out_dims', int)])
    agent_args['target_dim'] = agent_args['num_future_steps'] * len(agent_args['meas_for_net_init'])

    # experiment arguments
    agent_args['test_objective_params'] = (np.array([5, 11, 17]), np.array([1., 1., 1.]))
    agent_args['history_length'] = 1
    agent_args['test_checkpoint'] = 'model'

    print('starting simulator')
    simulator = DoomSimulator(simulator_args)
    print('started simulator')

    agent_args['discrete_controls'] = simulator.discrete_controls
    agent_args['continuous_controls'] = simulator.continuous_controls
    agent_args['state_imgs_shape'] = (agent_args['history_length'] * simulator.num_channels,
                                      simulator.resolution[1], simulator.resolution[0])

    if 'meas_for_net_init' in agent_args:
        agent_args['meas_for_net'] = []
        for ns in range(agent_args['history_length']):
            agent_args['meas_for_net'] += [i + simulator.num_meas * ns
                                           for i in agent_args['meas_for_net_init']]
        agent_args['meas_for_net'] = np.array(agent_args['meas_for_net'])
    else:
        agent_args['meas_for_net'] = np.arange(agent_args['history_length'] * simulator.num_meas)

    if len(agent_args['meas_for_manual_init']) > 0:
        # current timestep is the last in the stack
        agent_args['meas_for_manual'] = np.array([i + simulator.num_meas * (agent_args['history_length'] - 1)
                                                  for i in agent_args['meas_for_manual_init']])
    else:
        agent_args['meas_for_manual'] = []

    agent_args['state_meas_shape'] = (len(agent_args['meas_for_net']),)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
    ag = Agent(sess, agent_args)
    ag.load('./checkpoints')

    img_buffer = np.zeros((agent_args['history_length'], simulator.num_channels,
                           simulator.resolution[1], simulator.resolution[0]))
    meas_buffer = np.zeros((agent_args['history_length'], simulator.num_meas))
    curr_step = 0
    term = False

    # action combinations whose movement buttons cancel out (forward/backward
    # and left/right both off or both on) with no attack are swapped for a
    # fixed sensible action
    acts_to_replace = [a + b + d + e
                       for a in [[0, 0], [1, 1]]
                       for b in [[0, 0], [1, 1]]
                       for d in [[0]]
                       for e in [[0], [1]]]
    print(acts_to_replace)
    # MOVE_FORWARD MOVE_BACKWARD TURN_LEFT TURN_RIGHT ATTACK SPEED
    # SELECT_WEAPON2 SELECT_WEAPON3 SELECT_WEAPON4 SELECT_WEAPON5 SELECT_WEAPON6 SELECT_WEAPON7
    replacement_act = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

    while not term:
        if curr_step < agent_args['history_length']:
            img, meas, rwrd, term = simulator.step(np.squeeze(ag.random_actions(1)).tolist())
        else:
            state_imgs = np.transpose(
                np.reshape(img_buffer[np.arange(curr_step - agent_args['history_length'],
                                                curr_step) % agent_args['history_length']],
                           (1,) + agent_args['state_imgs_shape']),
                [0, 2, 3, 1])
            state_meas = np.reshape(
                meas_buffer[np.arange(curr_step - agent_args['history_length'],
                                      curr_step) % agent_args['history_length']],
                (1, agent_args['history_length'] * simulator.num_meas))
            curr_act = np.squeeze(ag.act(state_imgs, state_meas,
                                         agent_args['test_objective_params'])[0]).tolist()
            if curr_act[:6] in acts_to_replace:
                curr_act = replacement_act
            img, meas, rwrd, term = simulator.step(curr_act)

        if meas is not None and meas[0] > 30.:
            meas[0] = 30.

        if not term:
            img_buffer[curr_step % agent_args['history_length']] = img
            meas_buffer[curr_step % agent_args['history_length']] = meas
        curr_step += 1

    simulator.close_game()
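# The state stacking above pulls the last history_length frames out of a
# circular buffer with np.arange(curr_step - history_length, curr_step) %
# history_length; a self-contained toy check of that indexing:
import numpy as np

history_length = 3
buf = np.empty(history_length, dtype=int)
for step in range(7):                   # frames for steps 0..6, slot = step % 3
    buf[step % history_length] = step
idx = np.arange(7 - history_length, 7) % history_length
print(buf[idx])                         # -> [4 5 6]: the last three frames, oldest first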
def train():
    profits_list = []  # Will hold list of all profits as we go through training

    # Given command line input as below
    # if len(sys.argv) != 4:
    #     print("Usage: python train.py [stock] [window] [episodes]")
    #     exit()

    with open(os.path.join(os.path.dirname(__file__), 'config.yml'), 'r') as stream:
        config = yaml.safe_load(stream)

    # Unpackage data from terminal/config
    # stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
    stock_name, window_size, episode_count = (config['stock_name'],
                                              config['window_size'],
                                              config['num_epochs'])
    num_tech_indicators = config['num_tech_indicators']

    agent = Agent(window_size + num_tech_indicators, config)
    data = getStockDataVec(stock_name)
    env = TradingEnv(data, window_size)
    l = len(data) - 1

    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = env.get_state(0)
        env.reset_holdings()

        for t in range(l):
            action = agent.act(state)

            # sit
            next_state = env.get_state(t + 1)
            reward = 0

            if action == 1:  # buy
                # remembers the price bought at t, and the time bought
                env.buy(t)
                # print("Buy: " + formatPrice(data[t]))

            elif action == 2:  # sell
                reward, profit = env.sell(t)
                # print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(profit))

            done = True if t == l - 1 else False

            # Push all values to memory
            agent.memory.push(state, action, next_state, reward)
            state = next_state

            total_profit = env.net_profit(t)
            max_staked = env.max_spent

            if done:
                percent_return = total_profit / max_staked * 100
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("Max staked: " + formatPrice(max_staked))
                print("Percent return: " + "{0:.2f}%".format(percent_return))
                print("--------------------------------")
                profits_list.append((total_profit, percent_return))
                # print(profits_list)

            agent.optimize()

        if e % config['save_freq'] == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
            torch.save(agent.policy_net, config['policy_model'])
            torch.save(agent.target_net, config['target_model'])
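# train() expects a config.yml next to the script; an illustrative file
# covering every key read above (all values are placeholders, not from the
# original project):
#
#   stock_name: GSPC
#   window_size: 10
#   num_epochs: 1000
#   num_tech_indicators: 4
#   save_freq: 10
#   policy_model: models/policy_net.pt
#   target_model: models/target_net.pt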
def test_model(episode_count, data_test, data_test_open, start_balance, model_name):
    # Define arrays to store per episode values
    Act_datasize = len(data_test)
    Act_Bench_Stock1_Bal = int(np.floor((start_balance / 2) / data_test_open[0]))
    Act_Bench_Open_cash = start_balance / 2
    model = load_model("models/" + model_name)

    # Actual run
    episode_count = 0

    # Define arrays to store per episode values
    total_Prof = []
    total_stock1bal = []
    total_open_cash = []
    total_port_value = []
    total_days_played = []
    Act_total_Prof = []
    Act_total_stock1bal = []
    Act_total_open_cash = []
    Act_total_port_value = []
    Act_total_days_played = []
    actions_done_perday = []
    portfolio_value = []

    for e in range(1):  # here we run only for 1 episode, as it is a test run
        Bal_stock1_t2 = Act_Bench_Stock1_Bal
        done = False
        open_cash_t2 = Act_Bench_Open_cash
        total_profit = 0
        reward = 0

        # Initialize Agent
        agent_test = Agent(8, is_eval=True, model_name=model_name)
        # agent = Agent(8)
        agent_test.inventory1 = []
        for i in range(Bal_stock1_t2):
            agent_test.inventory1.append(data_test_open[0])

        # Timestep delta to make sure that with time reward increases for taking action
        timestep_delta = 0

        # Running episode over all days in the datasize
        for t in range(Act_datasize):
            print("..........")
            print(data_test.iloc[t, 0])
            state_class_obj = State(data_test_open, Bal_stock1_t2, open_cash_t2, t)
            state_array_obj = state_class_obj.getState()
            action = agent_test.act(state_array_obj)
            print("Total portfolio value: " + str(state_class_obj.portfolio_value) +
                  " stock 1 number: " + str(len(agent_test.inventory1)) +
                  " open cash: " + str(state_class_obj.open_cash))

            # reward should be more as time goes further. We will remove
            # reward_timedelta from the actual reward
            # reward_timedelta = (datasize - t) * timestep_delta
            change_percent_stock1 = (state_class_obj.Stock1Price - state_class_obj.fiveday_stock1) \
                                    / state_class_obj.fiveday_stock1 * 100
            # print("change_percent_stock1: " + str(change_percent_stock1))
            # print("change_percent_stock2: " + str(change_percent_stock2))

            if action == 0:  # buy stock 1
                if state_class_obj.Stock1Price > state_class_obj.open_cash:
                    '''
                    print("Buy stock 1 when it did not have cash, so bankrupt, end of episode")
                    reward=-reward_timedelta*10
                    done = True
                    '''
                    done = True  # end episode
                else:
                    # print("In Buy stock 1")
                    agent_test.inventory1.append(data_test_open[t])
                    Bal_stock1_t2 = len(agent_test.inventory1)
                    # Here we are buying 1 stock
                    open_cash_t2 = state_class_obj.open_cash - state_class_obj.Stock1Price

            if action == 1:  # sell stock 1
                if state_class_obj.Stock1Blnc < 1:
                    # print("sold stock 1 when it did not have stock 1, so bankrupt, end of episode")
                    done = True  # end episode
                else:
                    # print("In sell stock 1")
                    agent_test.inventory1.pop(0)
                    Bal_stock1_t2 = len(agent_test.inventory1)
                    # Bal_stock2_t2 = len(agent_test.inventory2)
                    # State[0] is the price of stock 1. Here we are selling 1 stock
                    open_cash_t2 = state_class_obj.open_cash + state_class_obj.Stock1Price

            if action == 2:  # Do nothing action
                Bal_stock1_t2 = len(agent_test.inventory1)
                # Bal_stock2_t2 = len(agent_test.inventory2)
                # print("Do nothing")

            if t == Act_datasize - 1:
                # print("t==datasize")
                done = True
                next_state_class_obj = State(data_test_open, Bal_stock1_t2, open_cash_t2, t)
                next_state_array_obj = next_state_class_obj.getState()
            else:
                # print("t!=datasize" + str(open_cash_t2))
                next_state_class_obj = State(data_test_open, Bal_stock1_t2, open_cash_t2, t + 1)
                next_state_array_obj = next_state_class_obj.getState()

            # print("Action is " + str(action) + " reward is " + str(reward))
            actions_done_perday.append(action)
            portfolio_value.append(next_state_class_obj.portfolio_value)

            if done:
                print("--------------------------------")
                print("Total Profit: " + formatPrice(next_state_class_obj.portfolio_value - start_balance))
                print("Total No. of days played: " + str(t) +
                      " out of overall days: " + str(Act_datasize))
                print("Total portfolio value: " + str(next_state_class_obj.portfolio_value) +
                      " stock 1 number: " + str(len(agent_test.inventory1)) +
                      " open cash: " + str(next_state_class_obj.open_cash))
                # + " stock 2 number: " + str(len(agent_test.inventory2))
                Act_total_Prof.append(total_profit)
                Act_total_stock1bal.append(len(agent_test.inventory1))
                # Act_total_stock2bal.append(len(agent_test.inventory2))
                Act_total_open_cash.append(state_class_obj.open_cash)
                Act_total_port_value.append(state_class_obj.portfolio_value)
                Act_total_days_played.append(t)
                print("--------------------------------")
                state_class_obj.reset()
                break

    opencash = state_class_obj.open_cash
    return total_profit, portfolio_value, opencash, Act_total_days_played
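# The State class used by test_model (and by train_model below) is not defined
# in this file. A minimal sketch consistent with how it is called; attribute
# names and the five-day average are inferred from usage, and the real class
# evidently exposes more features (the test agent is built with state size 8,
# the training agent with 5):
class State:
    def __init__(self, prices, stock1_balance, open_cash, t):
        self.Stock1Price = prices[t]
        self.Stock1Blnc = stock1_balance
        self.open_cash = open_cash
        # trailing five-day average, padded at the start of the series
        self.fiveday_stock1 = float(np.mean(prices[max(0, t - 4):t + 1]))
        self.portfolio_value = stock1_balance * prices[t] + open_cash

    def getState(self):
        return np.array([[self.Stock1Price, self.Stock1Blnc, self.open_cash,
                          self.fiveday_stock1, self.portfolio_value]])

    def reset(self):
        pass  # the original appears to clear per-episode bookkeeping here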
def train_model(episode_count, start_balance, data_train, training, date):
    from os import path

    # Define arrays to store per episode values
    total_Prof = []
    total_stock1bal = []
    total_open_cash = []
    total_port_value = []
    total_days_played = []
    batch_size = 64

    # Training run
    for e in range(episode_count + 1):
        print("..........")
        print("Episode " + str(e) + "/" + str(episode_count))
        Bal_stock1 = int(np.floor((start_balance / 2) / data_train[0]))
        open_cash = start_balance / 2
        datasize = training
        done = False
        total_profit = 0
        reward = 0
        max_reward = 0

        # Initialize Agent
        agent = Agent(5)
        agent.inventory1 = []
        for i in range(Bal_stock1):
            agent.inventory1.append(data_train[0])

        # Timestep delta to make sure that with time reward increases for taking action
        # timestep_delta = 0

        # Running episode over all days in the datasize
        for t in range(datasize):
            # print("..........")
            # print(pd_data1_train.iloc[t, 0])
            state_class_obj = State(data_train, Bal_stock1, open_cash, t)
            state_array_obj = state_class_obj.getState()
            action = agent.act(state_array_obj)

            change_percent_stock1 = (state_class_obj.Stock1Price - state_class_obj.fiveday_stock1) \
                                    / state_class_obj.fiveday_stock1 * 100
            # profit = data1_train[t] - agent.inventory1(-1)
            # print("change_percent_stock1: " + str(change_percent_stock1))
            # if action not in [0, 1, 2]:
            #     reward = reward - 1000
            # decide_reward(action, data_train)

            if action == 0:  # buy stock 1
                if state_class_obj.Stock1Price > state_class_obj.open_cash:
                    '''
                    print("Buy stock 1 when it did not have cash, so bankrupt, end of episode")
                    reward=-reward_timedelta*10
                    done = True
                    '''
                    reward = reward - 4000
                    # done = True  # end episode
                else:
                    # print("In Buy stock 1")
                    agent.inventory1.append(data_train[t])
                    Bal_stock1_t1 = len(agent.inventory1)
                    # Bal_stock2_t1 = len(agent.inventory2)
                    # Here we are buying 1 stock
                    open_cash_t1 = state_class_obj.open_cash - state_class_obj.Stock1Price

                    # needs to be reviewed
                    if state_class_obj.open_cash < 500:
                        reward = reward - 2000
                    elif 0.1 * Bal_stock1_t1 > Bal_stock1:
                        reward = reward - (1000 * Bal_stock1_t1)
                    # elif abs(change_percent_stock1) <= 2:
                    #     reward = reward - 2000
                    else:
                        reward = reward - (change_percent_stock1 * 1000)

            if action == 1:  # sell stock 1
                if state_class_obj.Stock1Blnc < 1:
                    # print("sold stock 1 when it did not have stock 1, so bankrupt, end of episode")
                    reward = reward - 4000
                    # done = True  # end episode
                else:
                    # print("In sell stock 1")
                    bought_price1 = agent.inventory1.pop(0)
                    Bal_stock1_t1 = len(agent.inventory1)
                    total_profit += data_train[t] - bought_price1
                    # Bal_stock2_t1 = len(agent.inventory2)
                    # State[0] is the price of stock 1. Here we are selling 1 stock
                    open_cash_t1 = state_class_obj.open_cash + state_class_obj.Stock1Price

                    if 0.1 * Bal_stock1_t1 > Bal_stock1:
                        reward = reward - (1000 * Bal_stock1_t1)
                    # elif abs(change_percent_stock1) <= 2:
                    #     reward = -1000
                    elif total_profit > 200:
                        reward = reward + (2000 * total_profit)
                    else:
                        reward = reward + (change_percent_stock1 * 100)
                    # total_profit += data1_train[t] - bought_price1
                    # print("reward for sell stock1 " + str(reward))

            if action == 2:  # Do nothing action
                # if abs(change_percent_stock1) <= 2:
                #     reward = 100
                if state_class_obj.open_cash < 0.05 * start_balance:
                    reward += 2000
                else:
                    reward = reward - 2000
                Bal_stock1_t1 = len(agent.inventory1)
                # Bal_stock2_t1 = len(agent.inventory2)
                open_cash_t1 = open_cash
                # print("Do nothing")

            if t == datasize - 1:
                # print("t==datasize")
                done = True
                next_state_class_obj = State(data_train, Bal_stock1_t1, open_cash_t1, t)
                next_state_array_obj = next_state_class_obj.getState()
            else:
                next_state_class_obj = State(data_train, Bal_stock1_t1, open_cash_t1, t + 1)
                next_state_array_obj = next_state_class_obj.getState()

            agent.memory.append((state_array_obj, action, reward, next_state_array_obj, done))
            # print("Action is " + str(action) + " reward is " + str(reward))
            Bal_stock1 = Bal_stock1_t1
            # Bal_stock2 = Bal_stock2_t1
            open_cash = open_cash_t1

            if done:
                total_Prof.append(total_profit)
                total_stock1bal.append(len(agent.inventory1))
                # total_stock2bal.append(len(agent.inventory2))
                total_open_cash.append(state_class_obj.open_cash)
                total_port_value.append(state_class_obj.portfolio_value)
                total_days_played.append(t)
                print("--------------------------------")
                state_class_obj.reset()
                break

            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)

        print(reward)
        if reward > max_reward:
            max_reward = reward
            agent.model.save("models/model_" + date + "-max")
        if e % 30 == 0:
            agent.model.save("models/model_" + date + "-" + str(e))

    if path.exists("models/model_" + date + "-max"):
        model_name = "model_" + date + "-max"
    else:
        model_name = "model_" + date + "-" + str(episode_count)

    return model_name
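# A hypothetical end-to-end run tying train_model and test_model together;
# the synthetic price series, DataFrame, and date below are placeholders,
# not from the original:
import numpy as np
import pandas as pd

prices = 100 + np.cumsum(np.random.randn(250))  # synthetic daily open prices
data_test = pd.DataFrame({'date': pd.date_range('2020-01-01', periods=250)})
model_name = train_model(episode_count=30, start_balance=100000,
                         data_train=prices, training=len(prices) - 1,
                         date="2020-01-01")
profit, daily_values, open_cash, days_played = test_model(0, data_test, prices,
                                                          100000, model_name)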
def evaluate(model_name):
    time_start = dt.now()

    model = load_model(model_name)  # Load the NN-agent model
    state_size = model.layers[0].input.shape.as_list()[1]  # Load the state size from the model
    window_size = int(state_size / 2)

    env = Environment(ds_path=ds_path, window_size=window_size, pip_pos=pip_pos,
                      stop_loss=stop_loss_value, trans_cost=trans_cost)
    actions = env.get_actions()  # Getting the available actions of the environment
    actions_size = env.get_actions_n()  # Getting the number of actions available in the environment

    agent = Agent(state_size=state_size, action_size=actions_size,
                  is_eval=True, model_name=model_name)

    state, reward = env.step("Hold")  # Making a first neutral action to get the first state
    total_revenue = 0

    while not env.done:  # Loop until we finish all the instances
        action = agent.act(state)  # The agent chooses an action based on the current state
        next_state, reward = env.step(actions[action])  # Getting the next state and reward for the chosen action

        # with open(log, "a+") as file:
        #     file.write(str(actions[action]) + "\n")  # Saving the performance on a file
        #     if env.stop_loss_triggered:
        #         file.write("Stop Loss Triggered!" + "\n")  # Saving the stop loss taken on a file
        #     file.write(str(reward) + "\n")  # Saving the performance on a file

        '''print(colored("Observation:", 'blue'), state)
        print(colored("Action:", 'yellow'), actions[action])
        if env.stop_loss_triggered:  # Alert when we got a stop loss from the environment
            print(colored('Stop loss triggered!', 'red'))
        print(colored("Next Observation:", 'blue'), next_state)
        print(colored("Reward:", 'cyan'), reward)'''

        total_revenue += reward
        # agent.memory.append((state, action, reward, next_state))  # Saving the experience
        state = next_state
        # if len(agent.memory) > batch_size:  # Making an analysis based on our experience
        #     agent.exp_replay(batch_size)

    # ******************* Showing and Saving the Results over a Single Episode *******************
    # print("-------------------------------------------------------------------------------------")
    if total_revenue > 0:
        print(colored("Total Profit: ", 'blue'), colored(str(round(total_revenue, 1)), 'cyan'), "pips")
    else:
        print(colored("Total Profit: ", 'blue'), colored(str(round(total_revenue, 1)), 'red'), "pips")

    with open(performance_file_path, "a+") as file:
        file.write(str(round(total_revenue, 1)) + "\n")  # Saving the performance on a file

    time_stop = dt.now()
    print(colored("Execution time for this episode:", 'yellow'),
          round((time_stop - time_start).total_seconds(), 0), "seconds")
for idx in range(length_data):
    len_buy = len(agent.buy_inventory)
    len_sell = len(agent.sell_inventory)

    if len_buy > 40:
        buy_flag = 1
        sell_flag = 0
    elif len_sell > 40:
        buy_flag = 0
        sell_flag = 1
    else:
        buy_flag = 0
        sell_flag = 0

    buy_sell_array = [len_buy, len_sell, buy_flag, sell_flag]
    action = agent.act(state, buy_sell_array)

    # TODO: is idx + 1 correct here? Possibly a bug.
    next_state = getStateFromCsvData(data, idx + 1, window_size)
    reward = 0

    if action == 1 and len(agent.sell_inventory) > 0:
        i = 0
        for i in range(0, int(len(agent.sell_inventory) / 10)):
            sold_price = agent.sell_inventory.pop(0)
            profit = sold_price - data[idx]
            reward += profit  # max(profit, 0)
            total_profit += profit
            print("Buy (close position): " + formatPrice(data[idx]) + " | Profit: " + formatPrice(profit))
        reward = reward / (i + 1)
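    # The snippet above only shows settling short positions on a buy signal;
    # by symmetry, a sketch of the likely matching branch for action == 2,
    # closing a tenth of the buy inventory (an assumption, since the original
    # continuation is not shown):
    elif action == 2 and len(agent.buy_inventory) > 0:
        i = 0
        for i in range(0, int(len(agent.buy_inventory) / 10)):
            bought_price = agent.buy_inventory.pop(0)
            profit = data[idx] - bought_price
            reward += profit
            total_profit += profit
            print("Sell (close position): " + formatPrice(data[idx]) + " | Profit: " + formatPrice(profit))
        reward = reward / (i + 1)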
print("stop loss:", stop_loss_value) print("pc: BH") # ********************************************* Looping over all Episodes ***************-****************************** for ep in range(n_episodes - n_prev_iterations): time_start = dt.now() total_revenue = 0 # Counts the total reward for a single episode print("Iteration: " + str(ep + 1) + "/" + str(n_episodes - n_prev_iterations)) env.reset() # Resetting the environment agent.reset() # Resetting the agent mini-batch memory state, reward = env.step( "Hold") # Making a first neutral action for get the first state # ******************************************* Looping over all Instances ******************************************* while not env.done: # Loop until we finish all the instances action = agent.act( state) # The agent choose an action based on the current state next_state, reward = env.step( actions[action] ) # Getting the next state and reward based on the action choose '''with open(log, "a+") as file: file.write(str(actions[action]) + "\n") # Saving the performance on a file if env.stop_loss_triggered: file.write("Stop Loss Triggered!" + "\n") # Saving the stop loss taken on a file file.write(str(reward) + "\n") # Saving the performance on a file''' '''print(colored("Observation:", 'blue'), state) print(colored("Action:", 'yellow'), actions[action]) if env.stop_loss_triggered: # Alert when we got a stop loss from the environment print(colored('Stop loss triggered!', 'red')) print(colored("Next Observation:", 'blue'), next_state) print(colored("Reward:", 'cyan'), reward)'''