Example #1
def main(stock_name, model_name):
    # if len(sys.argv) != 3:
    # 	print("Usage: python evaluate.py [stock] [model]")
    # 	exit()

    # stock_name, model_name = sys.argv[1], sys.argv[2]

    model = load_model("models/" + model_name)
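    # infer the window size from the saved model's input layer shape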
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32

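    # build the initial state from the start of the price series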
    state = getState(data, 0, window_size + 1)
    total_profit = 0
    agent.inventory = []

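    # walk forward through the series, acting once per time step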
    for t in range(l):
        action = agent.act(state)

        # sit
        next_state = getState(data, t + 1, window_size + 1)
        reward = 0

        if action == 1:  # buy
            agent.inventory.append(data[t])
            print("Buy: " + formatPrice(data[t]))

        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            print("Sell: " + formatPrice(data[t]) + " | Profit: " +
                  formatPrice(data[t] - bought_price))

        done = True if t == l - 1 else False
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

        if done:
            print("--------------------------------")
            print(stock_name + " Total Profit: " + formatPrice(total_profit))
            print("--------------------------------")
Example #2
def Trainer(stock_name, window_size, episode_count):
    agent = Agent(window_size)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32

    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = getState(data, 0, window_size + 1)

        total_profit = 0
        agent.inventory = []

        for t in range(l):
            action = agent.act(state)

            # sit
            next_state = getState(data, t + 1, window_size + 1)
            reward = 0

            if action == 1: # buy
                agent.inventory.append(data[t])
                print("Buy: " + formatPrice(data[t]))

            elif action == 2 and len(agent.inventory) > 0: # sell
                bought_price = agent.inventory.pop(0)
                reward = max(data[t] - bought_price, 0)
                total_profit += data[t] - bought_price
                print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))

            done = True if t == l - 1 else False
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state

            if done:
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("--------------------------------")

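            # replay a minibatch of stored transitions once enough have accumulated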
            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)

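        # checkpoint the model every 10 episodes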
        if e % 10 == 0:
            agent.model.save("models/model_ep" + str(e))
Example #3
def eval_model(stock_name, model_name):
    # Agent
    window_size = get_window_size(model_name)
    agent = Agent(window_size, True, model_name)

    # Environment
    env = SimpleTradeEnv(stock_name, window_size, agent)

    # Main loop
    state = env.reset()
    done = False

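    # roll the agent/environment loop until the episode ends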
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

    return env.total_profit
Example #4
stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(
    sys.argv[3])

agent = Agent(window_size)
data = getStockDataVec(stock_name)
l = len(data) - 1

for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)

    total_profit = 0
    agent.inventory = []

    for t in range(l):
        action = agent.act(state)

        # sit
        next_state = getState(data, t + 1, window_size + 1)
        reward = 0

        if action == 1:  # buy
            agent.inventory.append(data[t])
            # print("Buy: " + formatPrice(data[t]))

        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            # print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))
Example #5
def main():
    ## Simulator
    simulator_args = {}
    simulator_args['config'] = 'config/config.cfg'
    simulator_args['resolution'] = (widthIn,heightIn)
    simulator_args['frame_skip'] = 1
    simulator_args['color_mode'] = 'RGB24'
    simulator_args['game_args'] = "+name ICO +colorset 7"

    ## Agent
    agent_args = {}

    # preprocessing
    preprocess_input_images = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_images'] = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_measurements'] = lambda x: x / 100. - 0.5
    agent_args['num_future_steps'] = 6
    pred_scale_coeffs = np.expand_dims(
        (np.expand_dims(np.array([8., 40., 1.]), 1) * np.ones((1, agent_args['num_future_steps']))).flatten(), 0)
    agent_args['meas_for_net_init'] = range(3)
    agent_args['meas_for_manual_init'] = range(3, 16)
    agent_args['resolution'] = (width,height)
    # just use grayscale for nnet inputs
    agent_args['num_channels'] = 1


    # net parameters
    agent_args['net_type'] = "fc"
#    agent_args['net_type'] = "conv"
    agent_args['conv_params'] = np.array([(16, 5, 4), (32, 3, 2), (64, 3, 2), (128, 3, 2)],
                                         dtype=[('out_channels', int), ('kernel', int), ('stride', int)])
    agent_args['fc_img_params'] = np.array([(128,)], dtype=[('out_dims', int)])
    agent_args['fc_meas_params'] = np.array([(128,), (128,), (128,)], dtype=[('out_dims', int)])
    agent_args['fc_joint_params'] = np.array([(256,), (256,), (-1,)], dtype=[('out_dims', int)])
    agent_args['target_dim'] = agent_args['num_future_steps'] * len(agent_args['meas_for_net_init'])
    agent_args['n_actions'] = 7

    # experiment arguments
    agent_args['test_objective_params'] = (np.array([5, 11, 17]), np.array([1., 1., 1.]))
    agent_args['history_length'] = 3
    agent_args['history_length_ico'] = 3
    historyLen = agent_args['history_length']
    print ("HistoryLen: ", historyLen)

    print('starting simulator')
    simulator = DoomSimulator(simulator_args)
    num_channels = simulator.num_channels

    print('started simulator')

    agent_args['state_imgs_shape'] = (historyLen * num_channels,
                                      simulator.resolution[1],
                                      simulator.resolution[0])

    agent_args['n_ffnet_input'] = (agent_args['resolution'][0]*agent_args['resolution'][1])
    agent_args['n_ffnet_hidden'] = np.array([50,5])
    agent_args['n_ffnet_output'] = 1
    agent_args['n_ffnet_act'] = 7
    agent_args['n_ffnet_meas'] = simulator.num_meas
    agent_args['learning_rate'] = 1E-4

    modelDir = os.path.join(os.path.expanduser("~"), "Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/Models")

    if 'meas_for_net_init' in agent_args:
        agent_args['meas_for_net'] = []
        for ns in range(historyLen):
            agent_args['meas_for_net'] += [i + simulator.num_meas * ns for i in agent_args['meas_for_net_init']]
        agent_args['meas_for_net'] = np.array(agent_args['meas_for_net'])
    else:
        agent_args['meas_for_net'] = np.arange(historyLen * simulator.num_meas)
    if len(agent_args['meas_for_manual_init']) > 0:
        agent_args['meas_for_manual'] = np.array([i + simulator.num_meas * (historyLen - 1) for i in
                                                  agent_args[
                                                      'meas_for_manual_init']])  # current timestep is the last in the stack
    else:
        agent_args['meas_for_manual'] = []

    agent_args['state_meas_shape'] = (len(agent_args['meas_for_net']),)

#    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
#    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))

#    agent = Agent(sess, agent_args)
#    agent.load('/home/paul/Dev/GameAI/vizdoom_cig2017/icolearner/ICO1/checkpoints/ICO-8600')
#    print("model loaded..")

    #    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
#    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))

    img_buffer = np.zeros(
        (historyLen, simulator.resolution[1], simulator.resolution[0], num_channels), dtype='uint8')

    meas_buffer = np.zeros((historyLen, simulator.num_meas))
    act_buffer = np.zeros((historyLen, 7))
    act_buffer_ico = np.zeros((agent_args['history_length_ico'], 7))
    curr_step = 0
    old_step = -1
    term = False

    print ("state_meas_shape: ", meas_buffer.shape, " == ", agent_args['state_meas_shape'])
    print ("act_buffer_shape: ", act_buffer.shape)

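    # cap TensorFlow GPU memory use, build the agent, and restore a checkpoint if one exists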
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,log_device_placement=False))
    ag = Agent(sess, agent_args)

    if (os.path.isfile("checkpoints/checkpoint")):
        ag.load('/home/paul/Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/checkpoints/')
        print("model loaded..")
    else:
        print ("No model file, initialising...")


    diff_y = 0
    diff_x = 0
    diff_z = 0
    diff_theta = 0
    iter = 1
    epoch = 200
    radialFlowLeft = 30.
    radialFlowRight = 30.
    radialFlowInertia = 0.4
    radialGain = 4.
    rotationGain = 50.
    errorThresh = 10.
    updatePtsFreq = 50
    skipImage = 1
    skipImageICO = 5
    reflexGain = 1E-4
    flowGain = 0.
    netGain = 10.
    oldHealth = 0.

    # create masks for left and right visual fields - note that these only cover the upper half of the image
    # this is to help prevent the tracking getting confused by the floor pattern
    half_height = round(height/2)
    half_width = round(width/2)

    maskLeft = np.zeros([height, width], np.uint8)
    maskLeft[half_height:, :half_width] = 1.
    maskRight = np.zeros([height, width], np.uint8)
    maskRight[half_height:, half_width:] = 1.

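    # Lucas-Kanade optical-flow and Shi-Tomasi corner-detection parameters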
    lk_params = dict(winSize=(15, 15), maxLevel=2, criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
    feature_params = dict(maxCorners=500, qualityLevel=0.03, minDistance=7, blockSize=7)

    imgCentre = np.array([int(simulator_args['resolution'][0] / 2), int(simulator_args['resolution'][1] /2)])
    print ("Image centre: ", imgCentre)
    rawInputs = np.zeros((height, width))
    cheatInputs = np.zeros((width, height))
    input_buff = np.zeros((1,width*height))
    target_buff = np.zeros((1,1))
    meas_buff = np.zeros((1,simulator.num_meas))
    netOut = 0.
    netErr = np.zeros((width,height))
    delta = 0.
    shoot = 0

    reflexOn = False
    iter = 0

    while not term:
        if curr_step < historyLen:
            curr_act = np.zeros(7).tolist()
            img, meas, rwrd, term = simulator.step(curr_act)
            print("Image: ", img.shape, " max: ", np.amax(img), " min: ", np.amin(img))

            if curr_step == 0:
                p0Left = cv2.goodFeaturesToTrack(img[:,:,0], mask=maskLeft, **feature_params)
                p0Right = cv2.goodFeaturesToTrack(img[:,:,0], mask=maskRight, **feature_params)

            img_buffer[curr_step % historyLen] = img
            meas_buffer[curr_step % historyLen] = meas
            act_buffer[curr_step % historyLen] = curr_act[:7]

        else:
            img1 = img_buffer[(curr_step-2) % historyLen,:,:,:]
            img2 = img_buffer[(curr_step-1) % historyLen,:,:,:]
            state = simulator._game.get_state()

            stateImg = state.screen_buffer

            if(curr_step % updatePtsFreq == 0):
                p0Left = cv2.goodFeaturesToTrack(img[:,:,0], mask=maskLeft, **feature_params)
                p0Right = cv2.goodFeaturesToTrack(img[:,:,0], mask=maskRight, **feature_params)

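            # track the feature points between the two most recent frames in each half-field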
            p1Left, st, err = cv2.calcOpticalFlowPyrLK(img1[:,:,0], img2[:,:,0], p0Left, None, **lk_params)
            p1Right, st, err = cv2.calcOpticalFlowPyrLK(img1[:,:,0], img2[:,:,0], p0Right, None, **lk_params)
            flowLeft = (p1Left - p0Left)[:,0,:]
            flowRight = (p1Right - p0Right)[:,0,:]
            radialFlowTmpLeft = 0
            radialFlowTmpRight = 0

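            # average radial (expansion) component of the flow in each visual half-field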
            for i in range(0, len(p0Left)):
                radialFlowTmpLeft += ((p0Left[i,0,:] - imgCentre)).dot(flowLeft[i,:]) / float(len(p0Left))
            for i in range(0, len(p0Right)):
                radialFlowTmpRight += ((p0Right[i,0,:] - imgCentre)).dot(flowRight[i,:]) / float(len(p0Right))

            rotation = act_buffer[(curr_step - 1) % historyLen][6]
            forward = act_buffer[(curr_step - 1) % historyLen][3]
            # keep separate radial errors for left and right fields
            radialFlowLeft = radialFlowLeft + radialFlowInertia * (radialFlowTmpLeft - radialFlowLeft)
            radialFlowRight = radialFlowRight + radialFlowInertia * (radialFlowTmpRight - radialFlowRight)
            expectFlowLeft = radialGain * forward + (rotationGain * rotation if rotation < 0. else 0.)
            expectFlowRight = radialGain * forward - (rotationGain * rotation if rotation > 0. else 0.)

            flowErrorLeft = forward * (expectFlowLeft - radialFlowLeft) / (1. + rotationGain * np.abs(rotation))
            flowErrorRight = forward * (expectFlowRight - radialFlowRight) / (1. + rotationGain * np.abs(rotation))
            flowErrorLeft = flowErrorLeft if flowErrorLeft > 0. else 0.
            flowErrorRight = flowErrorRight if flowErrorRight > 0. else 0.
            icoSteer = 0.

            if curr_step > 100:
                health = meas[1]

                if (health<0.1):
                    reflexOn = False
                    iter = 0


                # Don't run any networks when the player is dead!
                if (health < 101. and health > 0.):

                    # steering drive: difference of the rectified left/right flow errors
                    icoInSteer = (flowGain * max(flowErrorRight - errorThresh, 0.)
                                  - flowGain * max(flowErrorLeft - errorThresh, 0.))

                    centre, bottomLeft, topRight, colourStrength = getMaxColourPos(stateImg, [255, 0, 0])
                    colourSteer = imgCentre[0]
                    cheatInputs = stateImg*1.

                    if(len(bottomLeft)>0 and len(topRight)>0 and ((topRight[0] - bottomLeft[0]) < width/3) and ((topRight[1] - bottomLeft[1]) < height/2)):
                        colourSteer = bottomLeft[0] + int(0.5 * (topRight[0] - bottomLeft[0]))
#                        cv2.imwrite("/home/paul/tmp/Backup/rect-" + str(curr_step) + ".jpg", cheatInputs)

                    cv2.arrowedLine(cheatInputs, (colourSteer, imgCentre[1]+10), (colourSteer, imgCentre[1]), color=(255,255,255), thickness=2)

                    rawInputs = np.array(np.sum(stateImg, axis=2) / 3)
                    cheatInputs = np.array(np.sum(cheatInputs, axis=2) / 3)
#                    cv2.imwrite("/home/paul/tmp/Backup/cheat-" + str(curr_step) + ".jpg", cheatInputs)

                    input_buff[0,:] = np.ndarray.flatten(cheatInputs)
                    input_buff = input_buff - np.mean(input_buff)
                    input_buff = input_buff / np.sqrt(np.var(input_buff))

                    # we want the reflex to be delayed with respect to the image input;
                    # otherwise the learning can never reduce the error to zero, no matter
                    # how good the controller is.
                    if (iter>2):
                        delta = (float(colourSteer) - float(imgCentre[0]))/float(width)
                    else:
                        delta = 0

                    if(iter>2):
                        if(np.abs(delta) < 0.01):
                            shoot = 1

                    target_buff[...] = delta + netOut
#                    target_buff[...] = delta

#                    target_buff[...] = 0.2
                    meas_buff[0,:] = meas

                    ag.act(input_buff, meas, target_buff)
                    if(ag.net_type == 'conv'):
                        netOut = np.ndarray.flatten(ag.ext_covnet_output)[0].flatten()[0]
                    elif(ag.net_type == 'fc'):
                        netOut = np.ndarray.flatten(ag.ext_fcnet_output)[0].flatten()[0]

                    print (" *** ", delta, delta + netOut, netGain*netOut, ag.learning_rate)

                    diff_theta = 0.6 * max(min((icoInSteer), 5.), -5.)

                    netErr[:,:] = 0.
                    diff_theta = diff_theta + reflexGain * colourStrength * delta

                    curr_act = np.zeros(7).tolist()
                    curr_act[0] = 0
                    curr_act[1] = 0
                    curr_act[2] = 0 #shoot
                    curr_act[3] = curr_act[3] + diff_z
                    curr_act[4] = 0
                    curr_act[5] = 0.
                    curr_act[6] = diff_theta + netGain*netOut

                    iter += 1


            if (curr_step % epoch == 0):
                ag.save('/home/paul/Dev/GameAI/vizdoom_cig2017/icodoom/ICO1/checkpoints/BP', curr_step)

            img, meas, rwrd, term = simulator.step(curr_act)
            if meas is not None and meas[0] > 30.:
                meas[0] = 30.

            if not term:
                img_buffer[curr_step % historyLen] = img
                meas_buffer[curr_step % historyLen] = meas
                act_buffer[curr_step % historyLen] = curr_act[:7]
        curr_step += 1


    simulator.close_game()
Example #6
stock_name, model_name = sys.argv[1], sys.argv[2]
model = load_model("models/" + model_name)
window_size = model.layers[0].input.shape.as_list()[1]

agent = Agent(window_size, True, model_name)
data = getStockDataVec(stock_name)
l = len(data) - 1
batch_size = 32

state = getState(data, 0, window_size + 1)
total_profit = 0
agent.inventory = []

for t in range(l):
    action = agent.act(state)

    # sit
    next_state = getState(data, t + 1, window_size + 1)
    reward = 0

    if action == 1:  # buy
        agent.inventory.append(data[t])
        print("Buy: " + formatPrice(data[t]))

    elif action == 2 and len(agent.inventory) > 0:  # sell
        bought_price = agent.inventory.pop(0)
        reward = max(data[t] - bought_price, 0)
        total_profit += data[t] - bought_price
        print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(data[t] - bought_price))
Example #7
# Set up the environment
portfolio = Portfolio(start_cash, trade_size, price_iter)
env = Environment(pairs, state_iter, portfolio)

state_shape = env.state().shape

agent = Agent(state_shape[0], is_eval=True, model_location=model_location)

num_steps = 1

metrics = pd.DataFrame(columns=['tick', 'action', 'value'])

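# step the environment with the agent's actions until the price iterator is exhausted (StopIteration)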
while True:
    tick = env.current_tick[0]
    cur_state = env.state()
    action = agent.act(cur_state, env.valid_actions())

    try:
        reward = env.execute(action)
        value = env.portfolio.valueAtTime(env.current_tick[0])
        metrics = metrics.append(
            {
                'tick': tick,
                'action': action,
                'value': value
            },
            ignore_index=True)
    except StopIteration:
        print("Evaluation ended after processing", num_steps - 1, "ticks")
        print(str(env))
        print(str(agent))
Example #8
# Visualizing the learning
# _, ax = plt.subplots()

agent = Agent(state_shape[0],
              is_eval=True,
              actor_model_location=actor_model_location,
              critic_model_location=critic_model_location)

num_steps = 1

metrics = pd.DataFrame(columns=['tick', 'action', 'value'])

while True:
    tick = env.current_tick[0]
    cur_state = env.state()
    action = agent.act(cur_state)

    try:
        reward = env.execute(action)
        value = env.portfolio.valueAtTime(env.current_tick[0])
        metrics = metrics.append(
            {
                'tick': tick,
                'action': action,
                'value': value
            },
            ignore_index=True)
    except StopIteration:
        print("Training ended after processing", num_steps - 1, "ticks")
        print(str(env))
        print(str(agent))
Example #9
File: train.py Project: lalame888/Project
hold_size = 0
for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    maxd = data[0]
    mind = data[0]
    history = []
    pp = np.mean(history) if history else float('nan')
    state = getState(data, 0, window_size, maxd, mind, len(agent.inventory),
                     agent.money / 1000000, pp)
    total_profit = 0
    agent.money = 1000000
    agent.inventory = []

    for t in range(l):
        history.append(data[t])
        action = agent.act(state, data[t])
        # print(action)
        if data[t] > maxd:
            maxd = data[t]
        elif data[t] < mind:
            mind = data[t]
        # sit
        pp = np.mean(history) if history else float('nan')
        next_state = getState(data, t + 1, window_size, maxd, mind,
                              len(agent.inventory), agent.money / 1000000, pp)
        agent.inventory.sort()
        reward = 0
        if action == 1:
            #print(np.mean(history))
            if len(agent.inventory) == 0:
                reward += 10
Example #10
def main():

    ## Simulator
    simulator_args = {}
    simulator_args['config'] = 'config/config.cfg'
    simulator_args['resolution'] = (160, 120)
    simulator_args['frame_skip'] = 2
    simulator_args['color_mode'] = 'GRAY'
    simulator_args['game_args'] = "+name IntelAct +colorset 7"

    ## Agent
    agent_args = {}

    # preprocessing
    agent_args['preprocess_input_images'] = lambda x: x / 255. - 0.5
    agent_args['preprocess_input_measurements'] = lambda x: x / 100. - 0.5
    agent_args['num_future_steps'] = 6
    pred_scale_coeffs = np.expand_dims(
        (np.expand_dims(np.array([8., 40., 1.]), 1) * np.ones(
            (1, agent_args['num_future_steps']))).flatten(), 0)
    agent_args['postprocess_predictions'] = lambda x: x * pred_scale_coeffs
    agent_args['discrete_controls_manual'] = range(6, 12)
    agent_args['meas_for_net_init'] = range(3)
    agent_args['meas_for_manual_init'] = range(3, 16)
    agent_args['opposite_button_pairs'] = [(0, 1), (2, 3)]

    # net parameters
    agent_args['conv_params'] = np.array([(16, 5, 4), (32, 3, 2), (64, 3, 2),
                                          (128, 3, 2)],
                                         dtype=[('out_channels', int),
                                                ('kernel', int),
                                                ('stride', int)])
    agent_args['fc_img_params'] = np.array([(128, )],
                                           dtype=[('out_dims', int)])
    agent_args['fc_meas_params'] = np.array([(128, ), (128, ), (128, )],
                                            dtype=[('out_dims', int)])
    agent_args['fc_joint_params'] = np.array([(256, ), (256, ), (-1, )],
                                             dtype=[('out_dims', int)])
    agent_args['target_dim'] = agent_args['num_future_steps'] * len(
        agent_args['meas_for_net_init'])

    # experiment arguments
    agent_args['test_objective_params'] = (np.array([5, 11, 17]),
                                           np.array([1., 1., 1.]))
    agent_args['history_length'] = 1
    agent_args['test_checkpoint'] = 'model'

    print('starting simulator')

    simulator = DoomSimulator(simulator_args)

    print('started simulator')

    agent_args['discrete_controls'] = simulator.discrete_controls
    agent_args['continuous_controls'] = simulator.continuous_controls
    agent_args['state_imgs_shape'] = (agent_args['history_length'] *
                                      simulator.num_channels,
                                      simulator.resolution[1],
                                      simulator.resolution[0])
    if 'meas_for_net_init' in agent_args:
        agent_args['meas_for_net'] = []
        for ns in range(agent_args['history_length']):
            agent_args['meas_for_net'] += [
                i + simulator.num_meas * ns
                for i in agent_args['meas_for_net_init']
            ]
        agent_args['meas_for_net'] = np.array(agent_args['meas_for_net'])
    else:
        agent_args['meas_for_net'] = np.arange(agent_args['history_length'] *
                                               simulator.num_meas)
    if len(agent_args['meas_for_manual_init']) > 0:
        agent_args['meas_for_manual'] = np.array([
            i + simulator.num_meas * (agent_args['history_length'] - 1)
            for i in agent_args['meas_for_manual_init']
        ])  # current timestep is the last in the stack
    else:
        agent_args['meas_for_manual'] = []
    agent_args['state_meas_shape'] = (len(agent_args['meas_for_net']), )

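    # limit GPU memory use, build the agent, and load the test checkpoint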
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            log_device_placement=False))
    ag = Agent(sess, agent_args)
    ag.load('./checkpoints')

    img_buffer = np.zeros(
        (agent_args['history_length'], simulator.num_channels,
         simulator.resolution[1], simulator.resolution[0]))
    meas_buffer = np.zeros((agent_args['history_length'], simulator.num_meas))
    curr_step = 0
    term = False

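    # six-button patterns with no net movement or turn and no attack; these are swapped for a fixed fallback action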
    acts_to_replace = [
        a + b + d + e for a in [[0, 0], [1, 1]] for b in [[0, 0], [1, 1]]
        for d in [[0]] for e in [[0], [1]]
    ]
    print(acts_to_replace)
    replacement_act = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
    #MOVE_FORWARD   MOVE_BACKWARD   TURN_LEFT   TURN_RIGHT  ATTACK  SPEED   SELECT_WEAPON2  SELECT_WEAPON3  SELECT_WEAPON4  SELECT_WEAPON5  SELECT_WEAPON6  SELECT_WEAPON7

    while not term:
        if curr_step < agent_args['history_length']:
            img, meas, rwrd, term = simulator.step(
                np.squeeze(ag.random_actions(1)).tolist())
        else:
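            # stack the buffered image/measurement history into the network's input format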
            state_imgs = np.transpose(
                np.reshape(
                    img_buffer[np.arange(
                        curr_step - agent_args['history_length'], curr_step) %
                               agent_args['history_length']],
                    (1, ) + agent_args['state_imgs_shape']), [0, 2, 3, 1])
            state_meas = np.reshape(
                meas_buffer[np.arange(curr_step - agent_args['history_length'],
                                      curr_step) %
                            agent_args['history_length']],
                (1, agent_args['history_length'] * simulator.num_meas))

            curr_act = np.squeeze(
                ag.act(state_imgs, state_meas,
                       agent_args['test_objective_params'])[0]).tolist()
            if curr_act[:6] in acts_to_replace:
                curr_act = replacement_act
            img, meas, rwrd, term = simulator.step(curr_act)
            if meas is not None and meas[0] > 30.:
                meas[0] = 30.

        if not term:
            img_buffer[curr_step % agent_args['history_length']] = img
            meas_buffer[curr_step % agent_args['history_length']] = meas
            curr_step += 1

    simulator.close_game()
Example #11
def train():
    profits_list = []  # will hold the list of all profits as we go through training

    # Given command line input as below

    # if len(sys.argv) != 4:
    #     print("Usage: python train.py [stock] [window] [episodes]")
    #     exit()

    with open(os.path.join(os.path.dirname(__file__), 'config.yml'),
              'r') as stream:
        config = yaml.safe_load(stream)

    # Unpackage data from terminal/config
    # stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
    stock_name, window_size, episode_count = config['stock_name'], config[
        'window_size'], config["num_epochs"]

    num_tech_indicators = config['num_tech_indicators']
    agent = Agent(window_size + num_tech_indicators, config)
    data = getStockDataVec(stock_name)
    env = TradingEnv(data, window_size)
    l = len(data) - 1

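    # one training episode = one full pass over the price series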
    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = env.get_state(0)

        env.reset_holdings()

        for t in range(l):
            action = agent.act(state)

            # sit
            next_state = env.get_state(t + 1)
            reward = 0

            if action == 1:  # buy
                #remembers the price bought at t, and the time bought
                env.buy(t)
                # print("Buy: " + formatPrice(data[t]))

            elif action == 2:  # sell
                reward, profit = env.sell(t)
                # print("Sell: " + formatPrice(data[t]) + " | Profit: " + formatPrice(profit))

            done = True if t == l - 1 else False
            # Push all values to memory
            agent.memory.push(state, action, next_state, reward)
            state = next_state
            total_profit = env.net_profit(t)
            max_staked = env.max_spent

            if done:
                percent_return = total_profit / max_staked * 100
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("Max staked: " + formatPrice(max_staked))
                print("Percent return: " + "{0:.2f}%".format(percent_return))
                print("--------------------------------")
                profits_list.append((total_profit, percent_return))
                # print(profits_list)
            agent.optimize()

        if e % config['save_freq'] == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
            torch.save(agent.policy_net, config['policy_model'])
            torch.save(agent.target_net, config['target_model'])
Example #12
def test_model(episode_count, data_test, data_test_open, start_balance,
               model_name):
    # Define arrays to store per episode values
    Act_datasize = len(data_test)
    Act_Bench_Stock1_Bal = int(
        np.floor((start_balance / 2) / data_test_open[0]))
    Act_Bench_Open_cash = start_balance / 2
    model = load_model("models/" + model_name)
    # Actual run
    episode_count = 0
    # Define arrays to store per episode values
    total_Prof = []
    total_stock1bal = []
    total_open_cash = []
    total_port_value = []
    total_days_played = []
    Act_total_Prof = []
    Act_total_stock1bal = []
    Act_total_open_cash = []
    Act_total_port_value = []
    Act_total_days_played = []
    actions_done_perday = []
    portfolio_value = []
    for e in range(1):  # here we run only for 1 episode, as it is Test run
        Bal_stock1_t2 = Act_Bench_Stock1_Bal
        done = False
        open_cash_t2 = Act_Bench_Open_cash
        total_profit = 0
        reward = 0

        # Initialize Agent
        agent_test = Agent(8, is_eval=True, model_name=model_name)
        # agent = Agent(8)

        agent_test.inventory1 = []
        for i in range(Bal_stock1_t2):
            agent_test.inventory1.append(data_test_open[0])
            # Timestep delta to make sure that with time reward increases for taking action
        timestep_delta = 0

        # Running episode over all days in the datasize
        for t in range(Act_datasize):
            print("..........")

            print(data_test.iloc[t, 0])
            state_class_obj = State(data_test_open, Bal_stock1_t2,
                                    open_cash_t2, t)
            state_array_obj = state_class_obj.getState()
            action = agent_test.act(state_array_obj)

            print("Total portfolio value: " +
                  str(state_class_obj.portfolio_value) + "  stock 1 number: " +
                  str(len(agent_test.inventory1)) + "  open cash" +
                  str(state_class_obj.open_cash))

            # reward should be more as time goes further. We will remove reward_timedelta from actual reward
            # reward_timedelta=(datasize-t)*timestep_delta

            change_percent_stock1 = (state_class_obj.Stock1Price -
                                     state_class_obj.fiveday_stock1
                                     ) / state_class_obj.fiveday_stock1 * 100

            # print("change_percent_stock1:  "+str(change_percent_stock1))
            # print("change_percent_stock2:  "+str(change_percent_stock2))
            if action == 0:  # buy stock 1
                if state_class_obj.Stock1Price > state_class_obj.open_cash:
                    '''
                    print("Buy stock 1 when it did not have cash, so bankrupt, end of episode")
                    reward=-reward_timedelta*10
                    done = True
                    '''
                    done = True
                    # end episode

                else:
                    # print("In Buy stock 1")
                    agent_test.inventory1.append(data_test_open[t])
                    Bal_stock1_t2 = len(agent_test.inventory1)
                    open_cash_t2 = state_class_obj.open_cash - state_class_obj.Stock1Price  # Here we are buying 1 stock

            if action == 1:  # sell stock 1
                if state_class_obj.Stock1Blnc < 1:
                    # print("sold stock 2 when it did not have stock 2, so bankrupt, end of episode")

                    done = True
                    # end episode
                else:
                    # print("In sell stock 1")
                    agent_test.inventory1.pop(0)

                    Bal_stock1_t2 = len(agent_test.inventory1)
                    # Bal_stock2_t2 = len(agent_test.inventory2)
                    open_cash_t2 = state_class_obj.open_cash + state_class_obj.Stock1Price  # State[0] is the price of stock 1. Here we are selling 1 stock

            if action == 2:  # Do nothing action
                Bal_stock1_t2 = len(agent_test.inventory1)
                # Bal_stock2_t2 = len(agent_test.inventory2)
            # print("Do nothing")

            if t == Act_datasize - 1:
                # print("t==datasize")
                done = True
                next_state_class_obj = State(data_test_open, Bal_stock1_t2,
                                             open_cash_t2, t)
                next_state_array_obj = next_state_class_obj.getState()
            else:
                # print("t!=datasize"+str(open_cash_t2))
                next_state_class_obj = State(data_test_open, Bal_stock1_t2,
                                             open_cash_t2, t + 1)
                next_state_array_obj = next_state_class_obj.getState()

            # print("Action is "+str(action)+" reward is" + str(reward))

            actions_done_perday.append(action)
            portfolio_value.append(next_state_class_obj.portfolio_value)

            if done:
                print("--------------------------------")
                print("Total Profit: " +
                      formatPrice(next_state_class_obj.portfolio_value -
                                  start_balance))
                print("Total No. of days played: " + str(t) +
                      "  out of overall days:  " + str(Act_datasize))
                print("Total portfolio value: " +
                      str(next_state_class_obj.portfolio_value) +
                      "  stock 1 number: " + str(len(agent_test.inventory1)) +
                      "  open cash" + str(next_state_class_obj.open_cash))
                # + "  stock 2 number: " + str(len(agent_test.inventory2))

                Act_total_Prof.append(total_profit)
                Act_total_stock1bal.append(len(agent_test.inventory1))
                # Act_total_stock2bal.append(len(agent_test.inventory2))
                Act_total_open_cash.append(state_class_obj.open_cash)
                Act_total_port_value.append(state_class_obj.portfolio_value)
                Act_total_days_played.append(t)

                print("--------------------------------")
                state_class_obj.reset()
                break
    opencash = state_class_obj.open_cash

    return total_profit, portfolio_value, opencash, Act_total_days_played
Example #13
def train_model(episode_count, start_balance, data_train, training, date):
    from os import path
    # Define arrays to store per episode values
    total_Prof = []
    total_stock1bal = []
    total_open_cash = []
    total_port_value = []
    total_days_played = []
    batch_size = 64
    # Training run
    for e in range(episode_count + 1):
        print("..........")
        print("Episode " + str(e) + "/" + str(episode_count))

        Bal_stock1 = int(np.floor((start_balance / 2) / data_train[0]))
        open_cash = start_balance / 2

        datasize = training
        done = False
        total_profit = 0
        reward = 0
        max_reward = 0

        # Initialize Agent
        agent = Agent(5)
        agent.inventory1 = []
        for i in range(Bal_stock1):
            agent.inventory1.append(data_train[0])
        # Timestep delta to make sure that with time reward increases for taking action
        # timestep_delta=0
        # Running episode over all days in the datasize
        for t in range(datasize):
            # print("..........")
            # print(pd_data1_train.iloc[t,0])
            state_class_obj = State(data_train, Bal_stock1, open_cash, t)
            state_array_obj = state_class_obj.getState()
            action = agent.act(state_array_obj)

            change_percent_stock1 = (state_class_obj.Stock1Price -
                                     state_class_obj.fiveday_stock1
                                     ) / state_class_obj.fiveday_stock1 * 100
            # profit=data1_train[t]-agent.inventory1(-1)
            # print("change_percent_stock1:  "+str(change_percent_stock1))

            # if action not in [0,1,2]:
            #     reward= reward-1000
            # decide_reward(action,data_train)
            if action == 0:  # buy stock 1
                if state_class_obj.Stock1Price > state_class_obj.open_cash:
                    '''
                    print("Buy stock 1 when it did not have cash, so bankrupt, end of episode")
                    reward=-reward_timedelta*10
                    done = True
                    '''

                    reward = reward - 4000
                    # done = True
                    # end episode

                else:
                    # print("In Buy stock 1")
                    agent.inventory1.append(data_train[t])
                    Bal_stock1_t1 = len(agent.inventory1)
                    # Bal_stock2_t1 = len(agent.inventory2)
                    open_cash_t1 = state_class_obj.open_cash - state_class_obj.Stock1Price  # Here we are buying 1 stock

                    # needs to be reviewed

                    if (state_class_obj.open_cash < 500):
                        reward = reward - 2000
                    elif (0.1 * Bal_stock1_t1 > Bal_stock1):
                        reward = reward - (1000 * Bal_stock1_t1)
                    # elif (abs(change_percent_stock1) <= 2):
                    #     reward = reward-2000
                    else:
                        reward = reward - (change_percent_stock1 * 1000)

            if action == 1:  # sell stock 1
                if state_class_obj.Stock1Blnc < 1:
                    # print("sold stock 2 when it did not have stock 2, so bankrupt, end of episode")
                    reward = reward - 4000
                    # done = True
                    # end episode
                else:
                    # print("In sell stock 1")
                    bought_price1 = agent.inventory1.pop(0)
                    Bal_stock1_t1 = len(agent.inventory1)
                    total_profit += data_train[t] - bought_price1
                    # Bal_stock2_t1 = len(agent.inventory2)
                    open_cash_t1 = state_class_obj.open_cash + state_class_obj.Stock1Price  # State[0] is the price of stock 1. Here we are selling 1 stock

                    if (0.1 * Bal_stock1_t1 > Bal_stock1):
                        reward = reward - (1000 * Bal_stock1_t1)
                    # elif (abs(change_percent_stock1) <= 2):
                    #     reward = -1000
                    elif total_profit > 200:
                        reward = reward + (2000 * total_profit)
                    else:
                        reward = reward + (
                            change_percent_stock1 * 100
                        )  # State[0] is the price of stock 1. Here we are selling 1 stock

                    # total_profit += data1_train[t] - bought_price1
                # print("reward for sell stock1 " + str(reward))

            if action == 2:  # Do nothing action
                # if (abs(change_percent_stock1) <= 2):
                #     reward = 100
                if (state_class_obj.open_cash < 0.05 * start_balance):
                    reward += 2000
                else:
                    reward = reward - 2000

                Bal_stock1_t1 = len(agent.inventory1)
                # Bal_stock2_t1 = len(agent.inventory2)
                open_cash_t1 = open_cash
            # print("Do nothing")

            if t == datasize - 1:
                # print("t==datasize")
                done = True
                next_state_class_obj = State(data_train, Bal_stock1_t1,
                                             open_cash_t1, t)
                next_state_array_obj = next_state_class_obj.getState()
            else:
                next_state_class_obj = State(data_train, Bal_stock1_t1,
                                             open_cash_t1, t + 1)
                next_state_array_obj = next_state_class_obj.getState()

            agent.memory.append(
                (state_array_obj, action, reward, next_state_array_obj, done))
            # print("Action is "+str(action)+" reward is" + str(reward))

            Bal_stock1 = Bal_stock1_t1
            # Bal_stock2 = Bal_stock2_t1
            open_cash = open_cash_t1

            if done:
                total_Prof.append(total_profit)
                total_stock1bal.append(len(agent.inventory1))
                # total_stock2bal.append(len(agent.inventory2))
                total_open_cash.append(state_class_obj.open_cash)
                total_port_value.append(state_class_obj.portfolio_value)
                total_days_played.append(t)
                print("--------------------------------")
                state_class_obj.reset()
                break

            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)
        print(reward)
        if reward > max_reward:
            max_reward = reward
            agent.model.save("models/model_" + date + "-max")

        if e % 30 == 0:
            agent.model.save("models/model_" + date + "-" + str(e))
    if path.exists("models/model_" + date + "-max"):
        model_name = "model_" + date + "-max"
    else:
        model_name = "model_" + date + "-" + str(episode_count)
    return model_name
Example #14
def evaluate(model_name):
    time_start = dt.now()

    model = load_model(model_name)  # Load the NN-agent model
    state_size = model.layers[0].input.shape.as_list()[1]  # load the state size from the model
    window_size = int(state_size / 2)
    env = Environment(ds_path=ds_path,
                      window_size=window_size,
                      pip_pos=pip_pos,
                      stop_loss=stop_loss_value,
                      trans_cost=trans_cost)
    actions = env.get_actions()  # the actions available in the environment
    actions_size = env.get_actions_n()  # the number of available actions in the environment

    agent = Agent(state_size=state_size,
                  action_size=actions_size,
                  is_eval=True,
                  model_name=model_name)

    state, reward = env.step("Hold")  # take a first neutral action to get the initial state
    total_revenue = 0

    while not env.done:  # Loop until we finish all the instances

        action = agent.act(state)  # the agent chooses an action based on the current state
        next_state, reward = env.step(actions[action])  # next state and reward for the chosen action
        #with open(log, "a+") as file:
        #file.write(str(actions[action]) + "\n")  # Saving the performance on a file
        #if env.stop_loss_triggered:
        #file.write("Stop Loss Triggered!" + "\n")  # Saving the stop loss taken on a file
        #file.write(str(reward) + "\n")  # Saving the performance on a file
        '''print(colored("Observation:", 'blue'), state)
		print(colored("Action:", 'yellow'), actions[action])
		if env.stop_loss_triggered:  # Alert when we got a stop loss from the environment
			print(colored('Stop loss triggered!', 'red'))
		print(colored("Next Observation:", 'blue'), next_state)
		print(colored("Reward:", 'cyan'), reward)'''

        total_revenue += reward

        #agent.memory.append((state, action, reward, next_state))  # Saving the experience
        state = next_state

        #if len(agent.memory) > batch_size:  # Making an analysis based on our experience
        #	agent.exp_replay(batch_size)

    # ***************************** Showing and Saving the Results over a Single Episode *******************************
    #print("-----------------------------------------------------------------------------------------------------------")
    if total_revenue > 0:
        print(colored("Total Profit: ", 'blue'),
              colored(str(round(total_revenue, 1)), 'cyan'), "pips")
    else:
        print(colored("Total Profit: ", 'blue'),
              colored(str(round(total_revenue, 1)), 'red'), "pips")
    with open(performance_file_path, "a+") as file:
        file.write(str(round(total_revenue, 1)) +
                   "\n")  # Saving the performance on a file
    time_stop = dt.now()
    print(colored("Execution time for this episode:", 'yellow'),
          round((time_stop - time_start).total_seconds(), 0), "seconds")
Example #15
    for idx in range(length_data):
        len_buy = len(agent.buy_inventory)
        len_sell = len(agent.sell_inventory)
        if len_buy > 40:
            buy_flag = 1
            sell_flag = 0
        elif len_sell > 40:
            buy_flag = 0
            sell_flag = 1
        else:
            buy_flag = 0
            sell_flag = 0

        buy_sell_array = [len_buy, len_sell, buy_flag, sell_flag]

        action = agent.act(state, buy_sell_array)

        # TODO: is idx + 1 correct here? Possibly a bug.
        next_state = getStateFromCsvData(data, idx + 1, window_size)
        reward = 0

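        # action 1: close about a tenth of the open sell positions and realize their profit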
        if action == 1 and len(agent.sell_inventory) > 0:
            i = 0
            for i in range(0, int(len(agent.sell_inventory) / 10)):
                sold_price = agent.sell_inventory.pop(0)
                profit = sold_price - data[idx]
                reward += profit  # max(profit, 0)
                total_profit += profit
                print("Buy(決済): " + formatPrice(data[idx]) + " | Profit: " +
                      formatPrice(profit))
            reward = reward / (i + 1)
Example #16
print("stop loss:", stop_loss_value)
print("pc: BH")
# ********************************************* Looping over all Episodes *********************************************
for ep in range(n_episodes - n_prev_iterations):
    time_start = dt.now()
    total_revenue = 0  # Counts the total reward for a single episode
    print("Iteration: " + str(ep + 1) + "/" +
          str(n_episodes - n_prev_iterations))
    env.reset()  # Resetting the environment
    agent.reset()  # Resetting the agent mini-batch memory
    state, reward = env.step("Hold")  # take a first neutral action to get the initial state

    # ******************************************* Looping over all Instances *******************************************
    while not env.done:  # Loop until we finish all the instances
        action = agent.act(state)  # the agent chooses an action based on the current state
        next_state, reward = env.step(actions[action])  # next state and reward for the chosen action
        '''with open(log, "a+") as file:
            file.write(str(actions[action]) + "\n")  # Saving the performance on a file
            if env.stop_loss_triggered:
                file.write("Stop Loss Triggered!" + "\n")  # Saving the stop loss taken on a file
            file.write(str(reward) + "\n")  # Saving the performance on a file'''
        '''print(colored("Observation:", 'blue'), state)
        print(colored("Action:", 'yellow'), actions[action])
        if env.stop_loss_triggered:  # Alert when we got a stop loss from the environment
            print(colored('Stop loss triggered!', 'red'))
        print(colored("Next Observation:", 'blue'), next_state)
        print(colored("Reward:", 'cyan'), reward)'''