Example #1
  def process(self):
    Agent.process(self)

    postponed = self._postponed_messages[:]
    self._postponed_messages = list()
    for m in postponed:
      self._process_message(m)
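
Not part of the original project: a minimal, self-contained sketch of the same postponed-message replay, assuming a hypothetical base Agent that collects unmatched messages in _postponed_messages:

class Agent:
    # Stand-in base class for illustration only; the real library class differs.
    def __init__(self):
        self._postponed_messages = []

    def process(self):
        pass  # a real base class would drain its normal inbox here

    def _process_message(self, message):
        print("handling", message)

class ReplayAgent(Agent):
    def process(self):
        Agent.process(self)
        postponed = self._postponed_messages[:]
        self._postponed_messages = list()
        for m in postponed:
            self._process_message(m)

a = ReplayAgent()
a._postponed_messages.extend(["msg-1", "msg-2"])
a.process()  # handles msg-1 and msg-2, leaving the postponed list empty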
Example #2
def main():
	"""
	Default settings for the news source and destinations; modify as needed.
	"""

	agent=Agent()

	# A SimpleWebSource that fetches news from the news site:
	url='http://news.163.com/'
	sourcename='网易'
	starttag='<!-- 头条区 -->'
	endtag='<div class="ns-wnews ns-recommand mb30" id="nsRecForYou"></div>'
	subtag='target="_blank"'
	titlePattern=r'<a(.*?)href="(http://.*?\.163\.com.*?)">(.*?)</a>'
	contentPattern=r'''
	<h1 id="h1title" class="ep-h1">(.*?)</h1>
	[\s\S]*?
	<div class="ep-time-soure cDGray">(.*?) 来源
	[\s\S]*?
	<div id="endText" class="end-text">
	([\s\S]*?)
	本文来源:(.*?)</span>
	'''
	netease=NeteaseSource(url,sourcename,starttag,
			endtag,subtag,titlePattern,contentPattern)
	# Add a plain-text destination and an HTML destination
	agent.addSource(netease)

	# Distribute the news items
	agent.distribute()
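
A hedged sketch (the HTML snippet below is invented for illustration) of how a pattern like titlePattern would typically be applied with re.findall to the slice between starttag and endtag:

import re

titlePattern = r'<a(.*?)href="(http://.*?\.163\.com.*?)">(.*?)</a>'
html = '<a class="demo" href="http://news.163.com/demo.html">Sample headline</a>'
for attrs, link, title in re.findall(titlePattern, html):
    print(link, title)  # http://news.163.com/demo.html Sample headline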
Example #3
def test_startup_and_shutdown():
    # Create an agent that throws an exception when it receives
    # a payload command packet.
    a = Agent()
    a.bind_udp_sockets()
    a.service_handler["Payload Command"] = Agent.raise_exception

    # Run agent.
    t = threading.Thread(target=Agent.run, args=(a,))
    t.daemon = True
    t.start()

    # Send an ACK packet
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    p.ack = 1
    Send.send_to_self(p)   

    # Wait for and then assert that thread has *not* exited.
    t.join(0.01)
    assert t.is_alive()

    # Send a payload command packet -- SHUTDOWN
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    Send.send_to_self(p)   

    # Wait for and then assert that thread has exited.
    t.join(0.01)
    assert not t.is_alive()
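
The assertions above rely on a generic pattern: run the worker in a daemon thread, join with a short timeout, then check is_alive(). A self-contained sketch of just that pattern, with no Packet/Supernova dependencies:

import queue
import threading

def worker(inbox):
    while True:
        if inbox.get() == "SHUTDOWN":
            return

inbox = queue.Queue()
t = threading.Thread(target=worker, args=(inbox,))
t.daemon = True
t.start()

inbox.put("ping")          # an ordinary message keeps the worker running
t.join(0.01)
assert t.is_alive()

inbox.put("SHUTDOWN")      # the shutdown command makes the worker return
t.join(0.5)
assert not t.is_alive()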
Example #4
    def solve(system,initV = None, gamma = 0.9):
        numNodes = system.network.numNodes
        numTrt = Agent.numTrt(system)
        numValidTrt = Agent.numValidTrt(numNodes,numTrt)

        if initV is None:
            initV = np.zeros((1 << numNodes,))


        it = 0
        maxIt = 1000
        tol = 1e-8
        cont = True

        v0 = initV

        while cont:
            v1 = ValueIteration.operT(system,gamma,v0)

            it += 1

            if np.linalg.norm(v1 - v0,2) < tol or it == maxIt:
                cont = False

            v0 = v1

        if it == maxIt:
            raise ValueError("ValueIteration hit iteration limit")

        return v0
Example #5
def test_timeout():
    # Create an agent that throws an exception when it receives
    # a payload command packet.
    a = Agent()
    a.bind_udp_sockets()
    a.service_handler["Payload Command"] = Agent.raise_exception

    # Set a timeout that is << delay.
    Agent.TIMEOUT = 0.005

    # Run agent.
    t = threading.Thread(target=Agent.run, args=(a,))
    t.daemon = True
    t.start()

    # Delay
    time.sleep(0.02)

    # Send a payload command packet -- SHUTDOWN
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    Send.send_to_self(p)   

    # Wait for and then assert that thread has exited.
    t.join(0.01)
    assert not t.is_alive()
Example #6
 def setUp(self):
     exchange1 = Exchange()
     exchange2 = Exchange()
     self.location1 = Location(exchange1)
     self.location2 = Location(exchange2)
     self.agent1 = Agent(location=self.location1)
     self.agent2 = Agent(location=self.location2)
Example #7
def run(args):
    logging.basicConfig(filename=args.LOG_FILE, level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler())

    game_handler = GameStateHandler(random_seed=123, frame_skip=args.FRAME_SKIP, use_sdl=False,
                                    image_processing=lambda x: crop_and_resize(x, args.IMAGE_HEIGHT, args.IMAGE_WIDTH))
    game_handler.loadROM(args.ROM_FILE)

    height, width = game_handler.getScreenDims()
    logging.info('Screen resolution is %dx%d' % (height, width))
    num_actions = game_handler.num_actions

    net = theano_qnetwork.DeepQNetwork(args.IMAGE_HEIGHT, args.IMAGE_WIDTH, num_actions, args.STATE_FRAMES, args.DISCOUNT_FACTOR)

    replay_memory = ReplayMemoryManager(args.IMAGE_HEIGHT, args.IMAGE_WIDTH, args.STATE_FRAMES, args.REPLAY_MEMORY_SIZE)

    monitor = Monitoring(log_train_step_every=100, smooth_episode_scores_over=50)
    agent = Agent(game_handler, net, replay_memory, None, monitor, args.TRAIN_FREQ, batch_size=args.BATCH_SIZE)

    start_epsilon = args.START_EPSILON
    exploring_duration = args.EXPLORING_DURATION

    agent.populate_replay_memory(args.MIN_REPLAY_MEMORY)
    agent.play(train_steps_limit=args.LEARNING_BEYOND_EXPLORING+args.EXPLORING_DURATION, start_eps=start_epsilon,
               final_eps=args.FINAL_EPSILON, exploring_duration=exploring_duration)
Example #8
 def __init__(self, aid, booksList):
     Agent.__init__(self, aid)
     
     self.booksList = booksList
     
     comportamento = ComportamentoAgenteLivraria(self)
     self.behaviours.append(comportamento)
Example #9
class Environment():
    def __init__(self):
        env = gym.make(ENV)
        self.env = wrappers.Monitor(env, '/tmp/gym/mountaincar_dqn', force=True)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(self.num_states, self.num_actions)

    def run(self):
        complete_episodes = 0
        episode_final = False
        output = open('result.log', 'w')

        print(self.num_states, self.num_actions)
        for episode in range(NUM_EPISODE):
            observation = self.env.reset()
            state = torch.from_numpy(observation).type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)

            for step in range(MAX_STEPS):
                if episode_final:
                    self.env.render(mode='rgb_array')

                action = self.agent.get_action(state, episode)

                observation_next, _, done, _ = self.env.step(action.item())

                state_next = torch.from_numpy(observation_next).type(torch.FloatTensor)
                state_next = torch.unsqueeze(state_next, 0)

                reward = torch.FloatTensor([0.0])
                if done:
                    state_next = None
                    if 199 <= step:
                        reward = torch.FloatTensor([-1.0])
                        complete_episodes = 0
                    else:
                        reward = torch.FloatTensor([1.0])
                        complete_episodes = complete_episodes + 1

                self.agent.memory(state, action, state_next, reward)
                self.agent.update_q_function()

                state = state_next

                if done:
                    message = 'episode: {0}, step: {1}'.format(episode, step)
                    print(message)
                    output.write(message + '\n')
                    break

                if episode_final:
                    break

                if 10 <= complete_episodes:
                    print('success 10 times in sequence')
                    # episode_final = True

        self.env.close()
        output.close()
Example #10
def main(game_name, lr, num_agents, update_target_every, model_name, tau):
    assert 'NoFrameskip-v4' in game_name

    if 'soft' in model_name:
        update_target_every = 1

    basename = '{}:lr={}:na={}:ute={}:{}'.format(
        game_name[:-14], lr, num_agents, update_target_every, model_name)

    if 'soft' in model_name:
        basename += ':tau={}'.format(tau)

    env = Agent(num_agents, game_name, basename)
    try:
        estimator = get_estimator(model_name, env.action_n, lr, 0.99, tau=tau)
        base_path = os.path.join(train_path, basename)
        print("start training!!")
        dqn(env,
            estimator,
            base_path,
            batch_size=32,
            epsilon=0.01,
            save_model_every=1000,
            update_target_every=update_target_every,
            learning_starts=200,
            memory_size=100000,
            num_iterations=40000000)
    except KeyboardInterrupt:
        print("\nKeyboard interrupt!!")
    except Exception:
        traceback.print_exc()
    finally:
        env.close()
Example #11
	def __init__(self, player_id, own_dice_list):
		Agent.__init__(self, player_id, own_dice_list)
		self.num_each_fv = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
		for fv in self.own_dice_list:
			self.num_each_fv[fv] += 1
		self.pg = ProbGenerator((NUM_PLAYERS-1)*NUM_DICE)
		self.pg.calc()
Example #12
File: run.py  Project: snigavig/triptripod
def run(N):
    """ Runs N episodes of a given length and then runs a demo with greedy policy
    """
    agent = Agent()

    data = read_data('./data/q.dat')
    if data is not None:
        agent.Q = data

    for i in range(N):
        bot = Bot()
        run_episode(bot, agent, None, draw=False, policy='eps_greedy')
    # if bot.center[1] > 7: print "robot moved on: %i steps" % bot.center[1]

    pg.init()
    pg.display.init()
    surf = pg.display.set_mode((800, 600))
    surf.fill((0, 0, 0))
    pg.display.flip()
    print "Surf1:", surf

    bot = Bot()
    bot.info()
    run_episode(bot, agent, surf, draw=True, policy='eps_greedy', episode_len=60)
    print "Robot's moves:\n", bot.path
    print "Robot walked %i m" % bot.center[1]
    print "Last state value=%.1f" % agent.get_state_value(bot.get_state())
    write_data(agent.Q, "data/q.dat")
    write_path(agent.Q_values, "data/path.csv")
Example #13
    def react(self, message):
        Agent.react(self, message)
        display_message(self.aid.name, 'Uma mensagem recebida')

        if 'agente_teste_participante' in self.aid.name:
            resposta = message.create_reply()
            resposta.set_content('Ola tambem agente!')
            self.send(resposta)
Example #14
	def __init__(self, name, fg, ms, opt):
		Agent.__init__(self, name, fg, ms, opt)

		self.f = self.fg.functions[self.name]
		self.neighbors = self.f.variables
		self.domains = {v:self.fg.variables[v].domain for v in self.neighbors}
		self.q = {v:{value:0 for value in self.domains[v]} for v in self.neighbors}
		self.terminated_neighbors = {v:False for v in self.neighbors}
Example #15
	def __init__(self, aid):
		Agent.__init__(self, aid)
		
		message = ACLMessage(ACLMessage.REQUEST)
		message.set_protocol(ACLMessage.FIPA_REQUEST_PROTOCOL)
		message.set_content('REQUEST')
		message.add_receiver('agent_participant_1')
		comportamento_1 = RequestIniciante(self, message)
		self.addBehaviour(comportamento_1)
Example #16
	def __init__(self, name, fg, ms, opt):
		Agent.__init__(self, name, fg, ms, opt)

		self.v = self.fg.variables[self.name]
		self.neighbors = self.v.functions
		self.domain = self.v.domain
		self.z = {value:0 for value in self.domain}
		self.r = {f:{value:0 for value in self.domain} for f in self.neighbors}
		self.z_queue = []
Example #17
def valueIteration(discountFactor):
	# all locations in grid
	alllocations = [ (x,y) for x in range(11) for y in range(11)]

	# initialize values
	values = {}
	bestMoves = {}
	for predloc in alllocations:
			for preyloc in alllocations:
				if preyloc != predloc:
					values[(predloc,preyloc)] = 0

	agent = Agent(0,0)

	deltas = []
	epsilon = 0.01
	delta = 1
	numIt = 0
	# perform value iteration according to the pseudo-code
	while delta > epsilon:
		delta = 0
		newValues = {}
		# loop over all states
		for predloc in alllocations:
			for preyloc in alllocations:
				if predloc == preyloc:
					continue
				agent.setLocation(predloc)
				prey = Prey(*preyloc)
				temp = values[(predloc,preyloc)]
				# find optimal value according to current values
				bestVal = 0
				bestMove = (0,0)
				for prob, predMove in agent.getMoveList():
					preySum = 0
					newPredloc = ((predloc[0] + predMove[0])%11,(predloc[1] + predMove[1])%11)
					if newPredloc == preyloc :
						preySum += 10.0
					else:
						for preyProb, newPreyloc in prey.expand(newPredloc):
							preySum += preyProb * discountFactor * values[(newPredloc,newPreyloc)]
					if bestVal <= preySum:
						bestVal = preySum
						bestMove = predMove
				newValues[(predloc,preyloc)] = bestVal
				bestMoves[(predloc,preyloc)] = bestMove
				delta = max(delta, np.abs(bestVal - temp))
		values = newValues
		deltas.append(delta)
		numIt+=1
	# greedy policy to the optimal values computed above
	def policy(state):
		predloc, preyloc = state
		agent.setLocation(predloc)
		prey = Prey(*preyloc)
		return bestMoves[(predloc,preyloc)]
	return numIt, values, policy
Example #18
def hello_world():
	get_ntp_time()
	e = threading.Event()
	ip, port, message = "10.1.1.2", 9999, 'Hello World'
	interval = 1
	counter = 20
	t1 = Agent(e,interval,counter,ip,port,message)
	t1.start()
	return message
Example #19
def valueIteration():

	alldiffs = [ (x,y) for x in range(-5,6) for y in range(-5,6)]
	alldiffs.remove((0,0))

	# the relative positions vary from -5 up to 5, in both dimensions
	values = {}
	for x in range(-5,6):
		for y in range(-5,6):
			values[(x,y)] = 0

	bestMoves = {}
	agent = Agent(0,0)

	deltas = []
	discountFactor = 0.8
	epsilon = 0.01
	delta = 1
	while delta > epsilon:
		delta = 0
		newValues = {}
		for diff in alldiffs:
			# we place the predator in the middle of the world,
			# we are allowed to do this, since the positions are encoded relatively
			predloc = (5,5)
			preyloc = (predloc[0]+diff[0],predloc[1]+diff[1])
			curKey  = rewriteStates(predloc,preyloc)
			agent.setLocation(predloc)
			prey = Prey(*preyloc)
			temp = values[curKey]
			bestVal = 0
			bestMove = (0,0)
			for prob, predMove in agent.getMoveList():
				preySum = 0
				newPredloc = agent.locAfterMove(predMove)
				if newPredloc == preyloc :
					preySum += 10.0
				else:
					for preyProb, newPreyloc in prey.expand(newPredloc):
						# using rewriteStates we use relative positions
						preySum += preyProb * discountFactor * values[rewriteStates(newPredloc,newPreyloc)]
				if bestVal <= preySum:
					bestVal = preySum
					bestMove = predMove
			newValues[curKey] = bestVal
			bestMoves[curKey] = bestMove
			delta = max(delta, np.abs(bestVal - temp))
		values = newValues
		deltas.append(delta)

	def policy(state):
		predloc, preyloc = state
		agent.setLocation(predloc)
		prey = Prey(*preyloc)
		return bestMoves[rewriteStates(predloc,preyloc)]
	return policy
Example #20
  def _process_message(self, message):
    for activity in self._activities:
      try:
        result = activity(message)
        self._activities.remove(activity)
        return result
      except MatchError:
        pass

    Agent._process_message(self, message)
Example #21
 def __init__(self, aid, bookStores):
     Agent.__init__(self, aid)
     
     self.bookStores = bookStores
     self.bestPropose = None
     self.bestBookStore = None
     self.proposes = []
     self.messages = []
     self.sends = 0
     self.receives = 0
Example #22
File: rec-ctl.py  Project: lywen52/Oreco
    def _run(self):
        def sigterm_clean(signum, frame):
            try:
                os.kill(os.getpid(), signal.SIGKILL)
            except:
                pass

        signal.signal(signal.SIGTERM, sigterm_clean)
        agent = Agent()
        agent.main()
Example #23
 def create_agent(self, device):
     datapath_id = device.datapath_id
     device_id = device.id
     for controller_endpoint in self.controller_endpoints:
         agent = Agent(controller_endpoint, datapath_id,
                       device_id, self.grpc_client, self.enable_tls,
                       self.key_file, self.cert_file)
         agent.start()
         self.agent_map[(datapath_id,controller_endpoint)] = agent
         self.device_id_to_datapath_id_map[device_id] = datapath_id
Example #24
File: clid.py  Project: nagius/cxm
def ctl_panic(*args):
	"""Ask master to engage panic mode."""
	def success(result):
		print "Panic mode requested..."

	agent=Agent()
	d=agent.panic()
	d.addCallback(success)
	
	return d
Example #25
File: clid.py  Project: nagius/cxm
def ctl_recover(*args):
	"""Ask master to recover from panic mode."""
	def success(result):
		print "Recovering from panic mode... Please check logs."

	agent=Agent()
	d=agent.recover()
	d.addCallback(success)
	
	return d
Example #26
	def __init__(self, player_id):
		Agent.__init__(self, player_id)
		#self.num_each_fv = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
		#for fv in self.own_dice_list:
		#	self.num_each_fv[fv] += 1
		self.pg = ProbGenerator(NUM_PLAYERS*NUM_DICE)
		self.pg.calc()
		self.good_bid_count = 0
		self.num_bids_made = 0
		self.bad_bid_count = 0
Example #27
    def on_start(self):
        Agent.on_start(self)
        display_message(self.aid.name, "Hello World")

        if 'agente_teste_iniciante' in self.aid.name:
            message = ACLMessage(ACLMessage.INFORM)
            message.add_receiver('agente_teste_participante')
            message.set_content('Ola Agente!')
            self.send(message)
            display_message(self.aid.name, 'Enviando mensagem...')
Example #28
 def __init__(self, aid):
     Agent.__init__(self, aid)
     
     pedido = {'tipo' : 'pedido', 'qtd' : 100.0}
     message = ACLMessage(ACLMessage.CFP)
     message.set_protocol(ACLMessage.FIPA_CONTRACT_NET_PROTOCOL)
     message.set_content(dumps(pedido))
     message.add_receiver('participant_agent_1')
     message.add_receiver('participant_agent_2')
     behaviour = InitiatorProtocol(self, message)
     self.addBehaviour(behaviour)
Example #29
    def operT(system,gamma,v):
        numNodes = system.network.numNodes
        numTrt = Agent.numTrt(system)
        numValidTrt = Agent.numValidTrt(numNodes,numTrt)

        vForA = np.zeros((1 << numNodes, numValidTrt))
        for aInd in range(numValidTrt):
            P,R = ValueIteration.calcPAndR(system,aInd)
            vForA[:,aInd] = (R + gamma * (P.dot(v)))

        return np.amax(vForA,1)
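
For context, operT is a Bellman optimality backup: for every action it forms R + gamma * P v and takes the per-state maximum. A tiny numpy sketch of the same backup on a made-up two-state, two-action MDP (not from the project):

import numpy as np

gamma = 0.9
P = [np.array([[0.8, 0.2], [0.1, 0.9]]),   # transition matrix of action 0
     np.array([[0.5, 0.5], [0.6, 0.4]])]   # transition matrix of action 1
R = [np.array([1.0, 0.0]),                 # reward vector of action 0
     np.array([0.0, 2.0])]                 # reward vector of action 1

def bellman_backup(v):
    # Column a holds R_a + gamma * P_a v; the row-wise max is the backed-up value.
    q = np.stack([R[a] + gamma * P[a].dot(v) for a in range(len(P))], axis=1)
    return q.max(axis=1)

v = np.zeros(2)
for _ in range(200):
    v = bellman_backup(v)   # iterating the backup converges to the optimal values
print(v)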
Example #30
File: clid.py  Project: nagius/cxm
def ctl_dump(*args):
	"""Dump current internal memory."""
	def success(result):
		import pprint
		pprint.pprint(result)

	agent=Agent()
	d=agent.getDump()
	d.addCallback(success)
	
	return d
Example #31
    def create_widgets(self):
        for i in range(15):
            for j in range(15):
                f = tk.Frame(self, height=50, width=50)
                f.pack_propagate(0)
                f.grid(row=i, column=j, padx=0, pady=0)
                self.frames.append(f)
                b = tk.Label(f, image=self.image[0], bg="yellow")
                b.pack(fill=tk.BOTH, expand=1)
                b.bind("<Button-1>", self.click(i, j))
                self.button.append(b)

root = tk.Tk()
root.wm_title("Alpha Gomoku")
root.attributes("-topmost", True)

with tf.Session() as sess:
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", type=str)
    parser.add_argument("--chkpnt", "-c", type=int)
    parser.add_argument("--ensemble", "-e", action="store_true")
    args = parser.parse_args()
    if args.model_name == "minimax":
        agent = MinimaxAgent(max_depth=6, max_width=6)
    elif args.model_name == "mininet":
        agent = MCTSMinimaxAgent(sess, "supervised", chkpnt=args.chkpnt)
    else:
        agent = Agent(sess, args.model_name, chkpnt=args.chkpnt)
    app = Application(agent, root, ensemble=args.ensemble)
    app.mainloop()
Example #32
import numpy as np
from agent import Agent
from utils import plot_learning_curve, make_env
import torch as T
from gym import wrappers

env = make_env('PongNoFrameskip-v4')
best_score = -np.inf
load_checkpoint = True
n_games = 1
agent = Agent(gamma=0.99,
              epsilon=0.1,
              lr=0.0001,
              input_dims=(env.observation_space.shape),
              n_actions=(env.action_space.n),
              mem_size=1,
              eps_min=0.1,
              batch_size=32,
              replace=1000,
              eps_dec=1e-5,
              checkpoint_dir='models/',
              algo='DuelingDQNAgent',
              env_name='PongNoFrameskip-v4')

agent.load_models()
print(agent.q_eval)

#env = wrappers.Monitor(env, "tmp/dqn-video", video_callable=lambda episode_id: True, force=True)

n_steps = 0
score = 0
done = False
Example #33
from agent import Agent
from funcs import playMatches

run_version = 1
player1version = 10
player2version = 50
EPISODES = 7
logger = loggers.logger_tourney
turns_until_tau0 = 0

env = Game()
network = ResCNN(config.REG_CONST, config.LEARNING_RATE, env.input_shape,
                 env.action_size, config.HIDDEN_CNN_LAYERS)

network.load(env.name, run_version, player1version)
player1 = Agent('player1', env.state_size, env.action_size, config.MCTS_SIMS,
                config.CPUCT, network)

network.load(env.name, run_version, player2version)
player2 = Agent('player2', env.state_size, env.action_size, config.MCTS_SIMS,
                config.CPUCT, network)

print('Players are ready, Tourney begins!')

goes_first = 0
scores, memory, points, sp_scores = playMatches(player1, player2, EPISODES,
                                                logger, turns_until_tau0, None,
                                                goes_first)

print(scores)
print(points)
print(sp_scores)
Example #34
 def __init__(self):
     self._agent = Agent()
Example #35
    )
    env_eval_callback = instantiate_eval_callback(env_name=args.env_name)

    if not args.stub_agent:
        agent = Agent(
            algo_name=args.algo_name,
            env_name=args.env_name,
            log_to_tensorboard=args.log_to_tensorboard,
            tb_log_name=args.tb_log_name,
            train_total_timesteps=args.train_total_timesteps,
            n_eval_episodes=args.n_eval_episodes,
            render=args.render,
            num_envs=args.num_envs,
            model_to_load=args.model_to_load,
            continue_learning=args.continue_learning,
            discrete_action_space=args.discrete_action_space,
            eval_callback=args.eval_callback,
            env_variables=env_variables,
            continue_learning_suffix=args.continue_learning_suffix,
            env_eval_callback=env_eval_callback,
            show_progress_bar=args.show_progress_bar,
            log_every=args.log_every,
            sb_version=args.sb_version,
            save_model=args.save_model,
            save_replay_buffer=args.save_replay_buffer,
            model_suffix=args.model_suffix,
        )
    else:
        agent = AgentStub(
            algo_name=args.algo_name,
            env_name=args.env_name,
Example #36
          '...')
    m_tmp = best_NN.read(env.name, initialise.INITIAL_RUN_NUMBER,
                         best_player_version)
    #current_NN.model.set_weights(m_tmp.get_weights())
    best_NN.model.set_weights(m_tmp.get_weights())
#otherwise just ensure the weights on the two players are the same
else:
    best_player_version = 0
    best_NN.model.set_weights(current_NN.model.get_weights())

#copy the config file to the run folder
copyfile('./config.py', run_folder + 'config.py')
#plot_model(current_NN.model, to_file=run_folder + 'models/model.png', show_shapes = True)

print('\n')

######## CREATE THE PLAYERS ########

#current_player = Agent('current_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, current_NN)
best_player = Agent('best_player', env.state_size, env.action_size,
                    config.MCTS_SIMS, config.CPUCT, best_NN)
human_player = WindowUser('lzqt', env.state_size, env.action_size)

game = PlayWithAI(human_player,
                  best_player,
                  lg.logger_main,
                  turns_until_tau0=config.TURNS_UNTIL_TAU0,
                  memory=memory)

game.initial()
game.loop()
Example #37
class Tablero:
    def __init__(self):
        self.dimensiones = (0,0)
        self.casillas = []
        self.agent = Agent()
        self.accion = Accion()
        self.pos_mario = None
        self.pos_tuberias = []
        self.restricciones_posiciones = [
            lambda posicion: posicion[0]<= self.dimensiones[0] and posicion[0] >= 0, #   valid x coordinates
            lambda posicion: posicion[1]<= self.dimensiones[1] and posicion[1] >= 0, #   valid y coordinates
        ]
        self.restricciones_casillas = [
            lambda casilla: not casilla.es_muro, #                                    not a wall cell
            lambda casilla: not casilla.visitado, #                                   not a visited cell
            lambda casilla: not casilla.es_tuberia #                                  not a pipe cell
        ]

    def ponerMurosTuberiasYMario(self, pos_muros, pos_tuberias, pos_mario):
        pos_x = 1
        for fila in self.casillas:
            pos_y = 1
            for casilla in fila:
                if (pos_x, pos_y) in pos_muros:
                    casilla.es_muro=True
                if (pos_x, pos_y) in pos_tuberias:
                    casilla.es_tuberia=True
                if (pos_x, pos_y) == pos_mario:
                    casilla.es_mario=True
                pos_y += 1
            pos_x += 1

    def crearTableroPorParametros(self, dimension_x, dimension_y, pos_muros, pos_tuberias, pos_mario):
        # define useful class properties, decreasing by one to adapt to indices that start at 0
        self.pos_tuberias = list(map(lambda posicion: (posicion[0]-1, posicion[1]-1), pos_tuberias))
        self.pos_mario = (pos_mario[0]-1, pos_mario[1]-1)
        self.dimensiones = (dimension_x-1, dimension_y-1)
        # The cells are built from the given dimensions,
        # as a list of lists of Casilla objects
        self.casillas = [[Casilla() for y in range(dimension_y)] for x in range(dimension_x)]
        self.ponerMurosTuberiasYMario(pos_muros, pos_tuberias, pos_mario)

    def definirPosicionesDeMurosYMario(self):
        for indice_x, fila in enumerate(self.casillas):
            for indice_y, elem in enumerate(fila):
                if elem.es_tuberia:
                    self.pos_tuberias.append((indice_x, indice_y))
                elif elem.es_mario:
                    self.pos_mario = (indice_x, indice_y)


    def crearTableroPorMapa(self, mapa):
        # Get the dimensions
        dimension_x = len(mapa) - 1
        dimension_y = len(mapa[0]) - 1
        self.dimensiones = (dimension_x, dimension_y)
        # Walk the symbol matrix to build the matrix of Casilla cells
        for fila in mapa:
            nueva_fila = []
            for elem in fila:
                # while building our Casilla matrix, mark each cell's type (wall, pipe, mario)
                casilla_aux = Casilla()
                casilla_aux.asignarTipo(elem)
                # Add the cell to the row
                nueva_fila.append(casilla_aux)
            # Add the row to the matrix
            self.casillas.append(nueva_fila)
        self.definirPosicionesDeMurosYMario()

    def mostrarCasillas(self):
        dimension_x = self.dimensiones[0]
        dimension_y = self.dimensiones[1]
        # print the header row with the board's column indices
        print('  ',end='')
        for i in range(dimension_y + 1):
            print(f'__{i + 1}_', end='')
        print()
        pos_x = 1
        # then print the side index and the values for each row
        for fila in self.casillas:
            print(f'{pos_x} |', end='')
            for casilla in fila:
                print(f' {casilla.representacion()} |',end='')
            pos_x += 1
            print()

    # BFS is used for the search
    
    
    def habilitarSucesores(self, sucesores):
        habilitados = []
        for sucesor in sucesores:
            if all([restriccion(sucesor) for restriccion in self.restricciones_posiciones]):
                casilla = self.casillas[sucesor[0]][sucesor[1]]
                if all([restriccion(casilla) for restriccion in self.restricciones_casillas]):
                    casilla.visitado = True
                    habilitados.append(sucesor)
        return habilitados

    def designarPadreASucesores(self, sucesores, padre):
        casilla_padre = self.casillas[padre[0]][padre[1]]
        for sucesor in sucesores:
            casilla_hijo = self.casillas[sucesor[0]][sucesor[1]]
            if casilla_hijo.valor == 0 or casilla_hijo.valor > casilla_padre.valor + 1:
                casilla_hijo.valor = casilla_padre.valor + 1
                casilla_hijo.designarPadre(padre)
    def limpiarVisitados(self):
        for fila in self.casillas:
            for casilla in fila:
                casilla.visitado = False

    def expandirSucesores(self, estado):
        acciones = [self.accion.arriba, self.accion.abajo, self.accion.derecha, self.accion.izquierda]
        sucesores = self.agent.funcion_transicion(estado, acciones)
        sucesores = self.habilitarSucesores(sucesores)
        self.designarPadreASucesores(sucesores, estado)
        return sucesores

    def busqueda_de_estados_BFS(self, estados_iniciales):
        # a set of queues, one per simultaneous BFS
        colas = []
        colas.extend([[estado_inicial] for estado_inicial in estados_iniciales])
        #cerrado = []
        num_expansion = 1
        while sum([len(cola) for cola in colas]) != 0:
            self.mostrarCasillas()
            print(f"Expansion numero {num_expansion}:")
            num_expansion += 1
            for cola in colas:
                if len(cola) != 0:
                    estado = cola.pop(0)
                    cola.extend(self.expandirSucesores(estado))
        self.limpiarVisitados()

    def resolver(self):
        self.busqueda_de_estados_BFS(self.pos_tuberias)

    def caminoParaMario(self ):
        # First add Mario's position
        camino_nodos = []
        camino_nodos.append(self.pos_mario)
        # Define the cell we iterate from
        #print(self.pos_mario)
        casilla_actual = self.casillas[self.pos_mario[0]][self.pos_mario[1]]
        num_saltos_requeridos = casilla_actual.valor
        # Walk back along the path
        while(casilla_actual.padre != None):
            camino_nodos.append(casilla_actual.padre)
            casilla_actual = self.casillas[casilla_actual.padre[0]][casilla_actual.padre[1]]
        # return the solution
        return (num_saltos_requeridos, camino_nodos)

    def mostrarCaminoMario(self):
        num_saltos, camino_nodos = self.caminoParaMario()
        print(f"Mario necesita {num_saltos} pasos para llegar a la tuberia mas cercana")
        print(f"A traves de las siguientes posiciones de casillas\
        {' -> '.join([repr((posicion[0] + 1, posicion[1] + 1)) for posicion in camino_nodos])}")
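
busqueda_de_estados_BFS above is essentially a multi-source BFS seeded at every pipe position; a compact, self-contained sketch of that idea on a plain character grid (the grid and helper here are illustrative, not from the project):

from collections import deque

def multi_source_bfs(grid, sources):
    # Distance from the nearest source to every open cell ('.' open, '#' wall).
    rows, cols = len(grid), len(grid[0])
    dist = {s: 0 for s in sources}
    queue = deque(sources)
    while queue:
        x, y = queue.popleft()
        for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nx, ny = x + dx, y + dy
            if 0 <= nx < rows and 0 <= ny < cols and grid[nx][ny] == '.' and (nx, ny) not in dist:
                dist[(nx, ny)] = dist[(x, y)] + 1
                queue.append((nx, ny))
    return dist

grid = ["..#.",
        "..#.",
        "...."]
print(multi_source_bfs(grid, [(0, 3), (2, 0)]))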
Example #38
from environment import Environment
from agent import Agent

env = Environment()
agent = Agent()

FRAMES_TO_RUN = 10

for i in range(FRAMES_TO_RUN):
    env.update_state()
    env.print_state()
Example #39
class PolicyLearner:
    def __init__(self,
                 stock_code,
                 chart_data,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 lr=0.01,
                 tax=False):
        self.stock_code = stock_code  # Stock code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)  # Environment object
        self.tax = tax
        # Agent object
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold,
                           tax=tax)
        self.training_data = training_data  # Training data
        self.sample = None
        self.training_data_idx = -1

        # Policy neural network; Input size = size of training data + agent state size
        self.num_features = self.training_data.shape[1] + self.agent.STATE_DIM
        self.AC = ACagent(input_dim=self.num_features,
                          output_dim=self.agent.NUM_ACTIONS,
                          lr=lr)
        self.visualizer = Visualizer()  # Visualization module

    def reset(self):
        self.sample = None
        self.training_data_idx = -1

    def fit(self,
            num_epoches=1000,
            max_memory=60,
            balance=10000000,
            discount_factor=0,
            start_epsilon=.5,
            learning=True,
            monkey=False):
        logging.info(
            "\n\nActor LR: {Alr}, Critic LR: {Clr}, DF: {discount_factor}, "
            "TU: [{min_trading_unit}, {max_trading_unit}], "
            "DRT: {delayed_reward_threshold}, Tax: {tax}".format(
                Alr=self.AC.actor_lr,
                Clr=self.AC.critic_lr,
                discount_factor=discount_factor,
                min_trading_unit=self.agent.min_trading_unit,
                max_trading_unit=self.agent.max_trading_unit,
                delayed_reward_threshold=self.agent.delayed_reward_threshold,
                tax=self.tax))

        # Visualization Preparation
        # Pre-visualize the chart data, since it does not change
        self.visualizer.prepare(self.environment.chart_data)

        # Prepare the folders to store visualization results
        epoch_summary_dir = os.path.join(
            settings.BASE_DIR, 'epoch_summary/%s/epoch_summary_%s' %
            (self.stock_code, settings.timestr))
        if not os.path.isdir(epoch_summary_dir):
            os.makedirs(epoch_summary_dir)

        # Set agent's initial balance
        self.agent.set_balance(balance)

        # Initialize the information about training
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training repetition
        for epoch in range(num_epoches):
            # Initialize the information about epoch
            #loss = 0.
            itr_cnt = 0
            win_cnt = 0
            exploration_cnt = 0
            batch_size = 0

            # Initialize the memory
            memory_sample = []
            memory_action = []
            memory_reward = []
            memory_prob = []
            memory_pv = []
            memory_num_stocks = []
            memory_exp_idx = []
            memory_learning_idx = []

            # Initialize the environment, agent and policy neural network
            self.environment.reset()
            self.agent.reset()
            self.AC.reset()
            self.reset()

            # Initialize the visualizer
            self.visualizer.clear([0, len(self.chart_data)])

            # Exploration rate decreases as you progress
            if monkey:
                epsilon = 1
            else:
                if learning:
                    epsilon = start_epsilon * (1. - float(epoch) /
                                               (num_epoches - 1))
                else:
                    epsilon = 0

            while True:
                # Sample generation
                next_sample = self._build_sample()
                if next_sample is None:
                    break

                # Actions decided by policy neural network or exploration
                action, confidence, exploration = self.agent.decide_action(
                    self.AC, self.sample, epsilon)

                # Perform the action you decided and earn immediate and delayed rewards
                immediate_reward, delayed_reward = self.agent.act(
                    action, confidence)

                # Store the actions and the consequences for the actions
                memory_sample.append(next_sample)
                memory_action.append(action)
                memory_reward.append(immediate_reward)
                memory_pv.append(self.agent.portfolio_value)
                memory_num_stocks.append(self.agent.num_stocks)
                memory = [
                    (memory_sample[i], memory_action[i], memory_reward[i])
                    for i in list(range(len(memory_action)))[-max_memory:]
                ]
                if exploration:
                    memory_exp_idx.append(itr_cnt)
                    memory_prob.append([np.nan] * Agent.NUM_ACTIONS)
                else:
                    memory_prob.append(self.AC.prob)

                # Update the information about iterations
                batch_size += 1
                itr_cnt += 1
                exploration_cnt += 1 if exploration else 0
                win_cnt += 1 if delayed_reward > 0 else 0

                # Update the policy neural network when in training mode and a delayed reward exists
                if delayed_reward == 0 and batch_size >= max_memory:
                    delayed_reward = immediate_reward
                    self.agent.base_portfolio_value = self.agent.portfolio_value
                if learning and delayed_reward != 0:
                    # Size of the batch training data
                    batch_size = min(batch_size, max_memory)
                    # Generate batch training data
                    x, _ = self._get_batch(memory, batch_size, discount_factor,
                                           delayed_reward)
                    if len(x) > 0:
                        # Update Policy neural network
                        self.AC.train_model(self.sample, action,
                                            delayed_reward, next_sample)
                        memory_learning_idx.append([itr_cnt, delayed_reward])
                    batch_size = 0

            # Visualize the information about epochs
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')

            self.visualizer.plot(epoch_str=epoch_str,
                                 num_epoches=num_epoches,
                                 epsilon=epsilon,
                                 action_list=Agent.ACTIONS,
                                 actions=memory_action,
                                 num_stocks=memory_num_stocks,
                                 outvals=memory_prob,
                                 exps=memory_exp_idx,
                                 learning=memory_learning_idx,
                                 initial_balance=self.agent.initial_balance,
                                 pvs=memory_pv)
            self.visualizer.save(
                os.path.join(
                    epoch_summary_dir,
                    'epoch_summary_%s_%s.png' % (settings.timestr, epoch_str)))

            logging.info("[Epoch {}/{}]\tEpsilon:{}\t#Expl.:{}/{}\t"
                         "#Buy:{}\t#Sell:{}\t#Hold:{}\t"
                         "#Stocks:{}\tPV:{:,}원\t".format(
                             epoch_str, num_epoches, round(epsilon,
                                                           4), exploration_cnt,
                             itr_cnt, self.agent.num_buy, self.agent.num_sell,
                             self.agent.num_hold, self.agent.num_stocks,
                             int(self.agent.portfolio_value)))

            # Update the information about training
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # Record the information about training in log
        logging.info("Max PV: {:,}원, \t # Win: {}".format(
            int(max_portfolio_value), epoch_win_cnt))

    def _get_batch(self, memory, batch_size, discount_factor, delayed_reward):
        x = np.zeros((batch_size, 1, self.num_features))
        y = np.full((batch_size, self.agent.NUM_ACTIONS), 0.5)

        for i, (sample, action,
                _) in enumerate(reversed(memory[-batch_size:])):
            x[i] = np.array(sample).reshape((-1, 1, self.num_features))
            y[i, action] = (delayed_reward + 1) / 2
            if discount_factor > 0:
                y[i, action] *= discount_factor**i
        return x, y

    def _build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    def trade(self, model_path=None, balance=2000000):
        if model_path is None:
            return
        self.AC.load_model(model_path=model_path)
        self.fit(balance=balance, num_epoches=1, learning=False)
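
One detail worth isolating from fit() is the linear exploration schedule: epsilon falls from start_epsilon at the first epoch to 0 at the last. A minimal stand-alone sketch of that schedule (the function name here is illustrative):

def linear_epsilon(epoch, num_epoches, start_epsilon=0.5):
    # Same shape as the schedule in fit(): start_epsilon at epoch 0, 0 at the last epoch.
    return start_epsilon * (1.0 - float(epoch) / (num_epoches - 1))

print([round(linear_epsilon(e, 5), 3) for e in range(5)])  # [0.5, 0.375, 0.25, 0.125, 0.0]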
Example #40
def main():
    # usage: arg1: -s to save, arg2: source comparison file, arg3: last test
    file_read = "./stats.txt"
    if (len(sys.argv) > 2):
        file_read = sys.argv[2]

    end = 156
    if (len(sys.argv) > 3):
        end = int(sys.argv[3]) + 1

    last_attempt = [None]

    f = open(file_read, "r").read()
    for i in f.split("\n")[:-1]:
        try:
            info = re.sub(
                r"\(.*?\)", "",
                i.split("duration")[1].replace("[ms]",
                                               "").replace("steps",
                                                           "")).split(":")
            last_attempt.append((int(info[0]), int(info[1])))
        except:
            last_attempt.append(((99999), (9999)))
    try:
        last_attempt_info = json.loads(f.split("\n")[-1])
    except:
        last_attempt_info = {
            "beaten": 0,
            "avg time[ms]": 99999,
            "max time[ms]": 99999,
            "min time[ms]": 99999,
            "avg steps": 99999,
            "max steps": 99999,
            "min steps": 99999
        }
    for i in range(len(last_attempt), 156):
        last_attempt.append(((99999), (9999)))

    current_attempt = [None]
    file_save = None
    if (len(sys.argv) > 1 and sys.argv[1] == "-s"):
        file_save = open("./stats_new.txt", "w")
    for i in range(1, end):
        start_time = time()
        #_original_stdout = sys.stdout
        #sys.stdout = open(os.devnull, 'w')
        steps = asyncio.run(
            Agent(open(f"levels/{i}.xsb").read()).solve(300, float("inf")))
        #sys.stdout.close()
        #sys.stdout = _original_stdout
        #print(">>", steps, "<<")
        elapsed_time = time() - start_time
        if (steps != None):
            current_attempt.append((int(elapsed_time * 1000), len(steps)))
            time_str = "{:>9}".format((str(int(elapsed_time * 1000))) + " ") + \
                "({:>9})".format(
                (str((int(elapsed_time * 1000)) - last_attempt[i][0])))
            level_str = "{:<2}".format(i)
            steps_str = "{:>6}".format(len(steps)) + \
                "({:>6})".format((len(steps) - last_attempt[i][1]))
            if (file_save):
                file_save.write(
                    f"level {level_str} - duration[ms] {time_str}:  steps {steps_str}\n"
                )
                file_save.flush()
            print(
                f"level {level_str} - duration[ms] {time_str}:  steps {steps_str}"
            )
        else:
            current_attempt.append(None)
            if (file_save):
                file_save.write(f"level {i} - Timed out\n")
                file_save.flush()
            print(f"level {i} - Timed out")

    times = [i[0] for i in current_attempt if i is not None]
    stepss = [i[1] for i in current_attempt if i is not None]
    beaten = [i for i in current_attempt if i is not None]

    info = {
        "beaten":
        len(beaten),
        "avg time[ms]":
        int(sum(times) / len(times)),
        "max time[ms]":
        max(times),
        "min time[ms]":
        min(times),
        "avg steps":
        int(sum(stepss) / len(stepss)),
        "max steps":
        max(stepss),
        "min steps":
        min(stepss),
        "diff beaten":
        len(beaten) - last_attempt_info["beaten"],
        "diff avg time[ms]":
        int(sum(times) / len(times)) - last_attempt_info["avg time[ms]"],
        "diff max time[ms]":
        max(times) - last_attempt_info["max time[ms]"],
        "diff min time[ms]":
        min(times) - last_attempt_info["min time[ms]"],
        "diff avg steps":
        int(sum(stepss) / len(stepss)) - last_attempt_info["avg steps"],
        "diff max steps":
        max(stepss) - last_attempt_info["max steps"],
        "diff min steps":
        min(stepss) - last_attempt_info["min steps"],
    }
    if (file_save):
        file_save.write(json.dumps(info))
        file_save.flush()
    print(json.dumps(info))
Example #41
start = time.time()


def updateMemory(agent):
    location = 'trainlocally/multi'
    fileNames = [f"{location}/Data/{f}" for f in listdir(f"{location}/Data/") if isfile(f"{location}/Data/{f}")]
    if len(fileNames) == 0:
        return
    fileNames.sort(key=lambda x: getsize(x), reverse=True)
    fileName = fileNames[0]
    if getsize(fileName) == 0:
        return
    try:
        with open(fileName, 'rb') as file:
            memory = pickle.load(file)
            remove(fileName)
        for mem in memory:
            agent.remember(*mem)
    except Exception as e:
        print(e)
        return


agent = Agent(memory=30000)
i = 0
while time.time() - start < 300:
    updateMemory(agent)
    print(i)
    time.sleep(1)
    i += 1
Example #42
def main():
    epsilon, discount, alpha, iterations, selfPlay, readValues = get_args(
        sys.argv)
    if selfPlay == True:
        agent1 = Agent(1, epsilon, discount, alpha)
        agent2 = Agent(-1, epsilon, discount, alpha)
        print(
            "Beginning self play. Corresponding state values will be stored in agent1_values.txt and agent2_values.txt"
        )
        for i in range(iterations):
            print("Iteration %d..." % (i))
            self_play(agent1, agent2)

        agent1.write_qvalues('agent1_values.txt')
        agent2.write_qvalues('agent2_values.txt')

    elif readValues == True:
        token = 0
        ai = 0
        while (True):
            token = input("What piece would you like to be (X/O)")
            if token == "X" or token == "O":
                break
        if token == "X":
            token = 1
            ai = Agent(token * -1,
                       epsilon=.2,
                       discount=.7,
                       alpha=.7,
                       readValues=True,
                       file="./agent2_values.txt")
        else:
            token = -1
            ai = Agent(token * -1,
                       epsilon=.2,
                       discount=.7,
                       alpha=.7,
                       readValues=True,
                       file="./agent1_values.txt")
        human = Human(token)
        if token == 1:
            # human is X
            play_human_vs_ai(human, ai, token)
            ai.write_qvalues("agent2_values.txt")
        else:
            play_human_vs_ai(ai, human, token)
            ai.write_qvalues("agent1_values.txt")
Example #43
    def __init__(self,
                 boxHeight,
                 boxWidth,
                 boxMargin,
                 rowCount,
                 columnCount,
                 master=None,
                 alpha=0.1,
                 gamma=0.9,
                 epsilon=0.1,
                 episode_count=500,
                 game_sleep=0.1,
                 training_sleep=0.01):
        super().__init__(master)
        self.colorBlack = '#000000'
        self.windowTitle = 'Run Forrest Run!! The RL Game'
        self.colorBlack = '#000000'
        self.colorWhite = '#FAFAFA'
        self.colorGray = '#B9B9B9'
        self.colorRunner = '#0A9F23'
        self.colorChaser = '#9F0A0A'
        self.colorRunnerPlaceConfiguration = '#B5FFB9'
        self.colorChaserPlaceConfiguration = '#FFB5B5'
        self.colorRockPlaceConfiguration = '#B5ECFF'
        self.colorRunnerBehaviourConfiguration = '#F8B5FF'
        self.colorRockCountConfiguration = '#FFFFB5'
        self.colorTurnCountConfiguration = '#B5CFFF'
        self.buttonGrassText = 'grass'
        self.buttonRockText = 'rock'
        self.buttonRunnerText = 'runner'
        self.buttonChaser1Text = 'chaser1'
        self.buttonChaser2Text = 'chaser2'
        self.boxFont = ("Calibri", 22)
        self.buttonFont = ("Calibri", 12)
        self.windowHeight = 302 + boxHeight * (rowCount +
                                               1) + boxMargin * (rowCount + 2)
        self.windowWidth = boxWidth * (columnCount +
                                       1) + boxMargin * (columnCount + 2)
        self.boxWidth = boxWidth
        self.boxHeight = boxHeight
        self.boxMargin = boxMargin
        self.rowCount = rowCount
        self.columnCount = columnCount
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.game_sleep = game_sleep
        self.training_sleep = training_sleep
        self.boardTopMargin = self.windowHeight - (
            self.rowCount + 1) * self.boxHeight - (self.rowCount +
                                                   2) * self.boxMargin
        self.generalHeight = (self.boardTopMargin - 8 * self.boxMargin) / 8
        self.configurationFrameHeaderWidth = (self.windowWidth -
                                              3 * self.boxMargin) * 0.5
        self.configurationFrameHeaderX = self.boxMargin
        self.configurationFrameHeaderY = self.boxMargin
        self.consoleHeaderX = self.boxMargin * 2 + self.configurationFrameHeaderWidth
        self.consoleHeaderY = self.boxMargin
        self.configurationFrameEltWidth = (self.configurationFrameHeaderWidth -
                                           2 * self.boxMargin) / 3
        self.window = master
        self.runnerController = tk.IntVar()
        self.runnerController.set(BehaviorConfig.Auto.value)
        self.runnerPlaceController = tk.IntVar()
        self.runnerPlaceController.set(PlaceConfig.Random.value)
        self.chaserPlaceController = tk.IntVar()
        self.chaserPlaceController.set(PlaceConfig.Random.value)
        self.obstaclePlaceController = tk.IntVar()
        self.obstaclePlaceController.set(PlaceConfig.Random.value)
        self.obstacleCount = tk.IntVar()
        self.obstacleCount.set(20)
        self.obstacleCounter = 0
        self.turnCount = tk.IntVar()
        self.turnCount.set(100)
        self.turnCounter = 0
        self.window.title(self.windowTitle)
        self.window.resizable(0, 0)
        self.window.geometry(
            str(self.windowWidth) + 'x' + str(self.windowHeight))
        self.window.wm_iconphoto(
            False, ImageTk.PhotoImage(Image.open('image/runner.png')))
        self.window.configure(bg=self.colorGray)
        self.imageGrass = tk.PhotoImage(file='image/grass.png')
        self.imageObstacle = tk.PhotoImage(file='image/obstacle.png')
        self.imageRunner = tk.PhotoImage(file='image/runner.png')
        self.imageChaser1 = tk.PhotoImage(file='image/chaser1.png')
        self.imageChaser2 = tk.PhotoImage(file='image/chaser2.png')
        self.buttonImage = {
            ButtonType.Grass: self.imageGrass,
            ButtonType.Obstacle: self.imageObstacle,
            ButtonType.Runner: self.imageRunner,
            ButtonType.Chaser1: self.imageChaser1,
            ButtonType.Chaser2: self.imageChaser2
        }
        self.state = []
        self.original_agent_states = {}
        self.waitingJob = WaitingJob.ApplyConfiguration

        self.isRunnerCaught = False

        self.default_q_table = pd.DataFrame(
            0,
            index=pd.MultiIndex.from_product([
                list(range(1, rowCount + 1)),
                list(range(1, columnCount + 1))
            ]),
            columns=['NO MOVE', 'NORTH', 'EAST', 'SOUTH', 'WEST'])

        #self.runner = Agent(0, 0, ButtonType.Runner, self.default_q_table.copy())
        #self.chaser1 = Agent(0, 0, ButtonType.Chaser1, self.default_q_table.copy())
        #self.chaser2 = Agent(0, 0, ButtonType.Chaser2, self.default_q_table.copy())
        self.runner = Agent(
            0, 0, ButtonType.Runner,
            pd.read_pickle('pickle/runner_' + str(self.rowCount) + '_' +
                           str(self.columnCount) + '.pkl'))
        self.chaser1 = Agent(
            0, 0, ButtonType.Chaser1,
            pd.read_pickle('pickle/chaser1_' + str(self.rowCount) + '_' +
                           str(self.columnCount) + '.pkl'))
        self.chaser2 = Agent(
            0, 0, ButtonType.Chaser2,
            pd.read_pickle('pickle/chaser2_' + str(self.rowCount) + '_' +
                           str(self.columnCount) + '.pkl'))

        self.runnerDefaultPlace = (1, 1)
        self.chaser1DefaultPlace = (self.rowCount, self.columnCount)
        self.chaser2DefaultPlace = (self.rowCount, self.columnCount - 1)

        self.applyConfigButtonList = []

        self.isTraining = False
        self.episode_count = episode_count
        self.episode_counter = 0

        counter = 0
        while counter <= self.columnCount:
            self.window.rowconfigure(counter, weight=1)
            self.window.columnconfigure(counter, weight=1)
            counter += 1

        self.configurationsHeaderLabel = self.makeLabel(
            self.window,
            self.configurationFrameHeaderX,
            self.configurationFrameHeaderY,
            self.generalHeight,
            self.configurationFrameHeaderWidth,
            text='Configurations',
            bg=self.colorWhite)
        self.runnerPlaceConfigurationLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 2 + self.generalHeight * 1,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Runner Place',
            bg=self.colorRunnerPlaceConfiguration)
        self.runnerPlaceDefaultRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 3 + self.generalHeight * 2,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Default',
            bg=self.colorRunnerPlaceConfiguration,
            value=PlaceConfig.Default.value,
            variable=self.runnerPlaceController,
            justify=tk.LEFT)
        self.runnerPlaceRandomRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 4 + self.generalHeight * 3,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Random',
            bg=self.colorRunnerPlaceConfiguration,
            value=PlaceConfig.Random.value,
            variable=self.runnerPlaceController)
        self.runnerPlaceManualRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 5 + self.generalHeight * 4,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Manual',
            bg=self.colorRunnerPlaceConfiguration,
            value=PlaceConfig.Manual.value,
            variable=self.runnerPlaceController)
        self.chaserPlaceConfigurationLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 2 + self.generalHeight * 1,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Chaser Place',
            bg=self.colorChaserPlaceConfiguration)
        self.chaserPlaceDefaultRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 3 + self.generalHeight * 2,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Default',
            bg=self.colorChaserPlaceConfiguration,
            value=PlaceConfig.Default.value,
            variable=self.chaserPlaceController,
            justify=tk.LEFT)
        self.chaserPlaceRandomRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 4 + self.generalHeight * 3,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Random',
            bg=self.colorChaserPlaceConfiguration,
            value=PlaceConfig.Random.value,
            variable=self.chaserPlaceController)
        self.chaserPlaceManualRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 5 + self.generalHeight * 4,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Manual',
            bg=self.colorChaserPlaceConfiguration,
            value=PlaceConfig.Manual.value,
            variable=self.chaserPlaceController)
        self.rockPlaceConfigurationLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 2 + self.generalHeight * 1,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Rock Place',
            bg=self.colorRockPlaceConfiguration)
        self.rockPlaceDefaultRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 3 + self.generalHeight * 2,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Default',
            bg=self.colorRockPlaceConfiguration,
            value=PlaceConfig.Default.value,
            variable=self.obstaclePlaceController,
            justify=tk.LEFT)
        self.rockPlaceRandomRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 4 + self.generalHeight * 3,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Random',
            bg=self.colorRockPlaceConfiguration,
            value=PlaceConfig.Random.value,
            variable=self.obstaclePlaceController)
        self.rockPlaceManualRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 5 + self.generalHeight * 4,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Manual',
            bg=self.colorRockPlaceConfiguration,
            value=PlaceConfig.Manual.value,
            variable=self.obstaclePlaceController)
        self.runnerBehaviorConfigurationLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Runner Behavior',
            bg=self.colorRunnerBehaviourConfiguration)
        self.runnerBehaviorAutoRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Auto',
            bg=self.colorRunnerBehaviourConfiguration,
            value=BehaviorConfig.Auto.value,
            variable=self.runnerController)
        self.runnerBehaviorManualRB = self.makeRadioButton(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Manual',
            bg=self.colorRunnerBehaviourConfiguration,
            value=BehaviorConfig.Manual.value,
            variable=self.runnerController)
        self.turnCountLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Turn Count',
            bg=self.colorTurnCountConfiguration)
        self.turnCountSpinbox = self.makeSpinbox(
            self.window,
            self.configurationFrameEltWidth * 0 + self.boxMargin * 1,
            self.boxMargin * 8 + self.generalHeight * 7,
            self.generalHeight,
            self.configurationFrameEltWidth,
            textvariable=self.turnCount,
            from_=10,
            to=1000)
        self.obstacleCountLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Obstacle Count',
            bg=self.colorRockCountConfiguration)
        self.obstacleCountSpinbox = self.makeSpinbox(
            self.window,
            self.configurationFrameEltWidth * 1 + self.boxMargin * 2,
            self.boxMargin * 8 + self.generalHeight * 7,
            self.generalHeight,
            self.configurationFrameEltWidth,
            textvariable=self.obstacleCount,
            from_=1,
            to=50)
        self.applyConfigurationButton = self.makeButton(
            self.window,
            self.configurationFrameEltWidth * 2 + self.boxMargin * 3,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight * 2 + self.boxMargin,
            self.configurationFrameEltWidth,
            ButtonType.ApplyConfig,
            text='Apply\nConfiguration',
            bg='blue',
            fg=self.colorWhite,
            font=self.buttonFont)
        self.console = self.makeTextbox(
            self.window,
            self.consoleHeaderX,
            self.consoleHeaderY,
            self.windowHeight - (self.rowCount + 6) * self.boxMargin -
            (self.rowCount + 1) * self.boxHeight - self.generalHeight * 3,
            self.configurationFrameHeaderWidth,
            bg='light yellow')
        self.runnerScoreLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 3 + self.boxMargin * 4,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Runner Score',
            bg=self.colorWhite)
        self.runnerScoreBoard = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 3 + self.boxMargin * 4,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='0',
            bg=self.colorWhite)
        self.chaser1ScoreLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 4 + self.boxMargin * 5,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Chaser 1 Score',
            bg=self.colorWhite)
        self.chaser1ScoreBoard = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 4 + self.boxMargin * 5,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='0',
            bg=self.colorWhite)
        self.chaser2ScoreLabel = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 5 + self.boxMargin * 6,
            self.boxMargin * 6 + self.generalHeight * 5,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='Chaser 2 Score',
            bg=self.colorWhite)
        self.chaser2ScoreBoard = self.makeLabel(
            self.window,
            self.configurationFrameEltWidth * 5 + self.boxMargin * 6,
            self.boxMargin * 7 + self.generalHeight * 6,
            self.generalHeight,
            self.configurationFrameEltWidth,
            text='0',
            bg=self.colorWhite)
        self.startGameButton = self.makeButton(
            self.window,
            self.configurationFrameEltWidth * 3 + self.boxMargin * 4,
            self.boxMargin * 8 + self.generalHeight * 7,
            self.generalHeight,
            self.configurationFrameEltWidth * 1.5 + self.boxMargin * 1,
            ButtonType.StartGame,
            text='Start Game',
            bg='blue',
            fg=self.colorWhite,
            font=self.buttonFont,
            state='disabled')
        self.trainAgentsButton = self.makeButton(
            self.window,
            self.configurationFrameEltWidth * 4.5 + self.boxMargin * 5,
            self.boxMargin * 8 + self.generalHeight * 7,
            self.generalHeight,
            self.configurationFrameEltWidth * 1.5 + self.boxMargin * 1,
            ButtonType.StartGame,
            text='Train Agents',
            bg='yellow',
            fg=self.colorWhite,
            font=self.buttonFont,
            state='disabled')
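        # Both action buttons start disabled; they are expected to be enabled
        # once a configuration has been applied (see the 'Waiting for
        # configuration...' console message below).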

        self.applyConfigurationButton.bind('<1>', self.handleEvent)
        self.startGameButton.bind('<1>', self.handleEvent)
        self.trainAgentsButton.bind('<1>', self.trainAgents)

        self.applyConfigButtonList.append(self.runnerPlaceDefaultRB)
        self.applyConfigButtonList.append(self.runnerPlaceRandomRB)
        self.applyConfigButtonList.append(self.runnerPlaceManualRB)
        self.applyConfigButtonList.append(self.chaserPlaceDefaultRB)
        self.applyConfigButtonList.append(self.chaserPlaceRandomRB)
        self.applyConfigButtonList.append(self.chaserPlaceManualRB)
        self.applyConfigButtonList.append(self.rockPlaceDefaultRB)
        self.applyConfigButtonList.append(self.rockPlaceRandomRB)
        self.applyConfigButtonList.append(self.rockPlaceManualRB)
        self.applyConfigButtonList.append(self.runnerBehaviorAutoRB)
        self.applyConfigButtonList.append(self.runnerBehaviorManualRB)
        self.applyConfigButtonList.append(self.turnCountSpinbox)
        self.applyConfigButtonList.append(self.obstacleCountSpinbox)
        self.applyConfigButtonList.append(self.applyConfigurationButton)

        self.elts = []

        self.appendToConsole('Welcome to the RL Game - Run Forrest Run!!')
        self.appendToConsole('Waiting for configuration...')
        self.initializeElts()
        self.initializeState()
        self.window.mainloop()
예제 #44
0
from agent import Agent
from environment import Environment  # assumed module path; Environment() is instantiated below
import matplotlib.pyplot as plt
import random
random.seed(1)
import numpy as np
from utils import compute_mse, Trace

if __name__ == '__main__':
    """
    test sarsa lambda algorithm
    """
    """ the learning curve of mean-squared error
        against episode number for lambda = 0 and lambda = 1
    """
    env = Environment()
    agent = Agent(env)
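    # The agent learns action values for env, first with TD(lambda) below and
    # then with Monte Carlo control at the end of the script.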
    print(
        'the learning curve of mean-squared error against episode number for')
    print('lambda = 0')
    agent.td_learning(10000, 0.0, True, trace=Trace.accumulating)

    agent.reset()
    print('lambda = 1')
    agent.td_learning(10000, 1.0, True, trace=Trace.accumulating)

    agent.reset()
    print('The mean-squared error against lambda')
    monte_carlo_iterations = 1000000
    td_iterations = 10000

    agent.monte_carlo_control(monte_carlo_iterations)
예제 #45
0
# Assumed imports for this standalone snippet; ENV, NUM_EPISODES, MAX_STEPS and
# Agent are expected to come from the project's own config/agent modules.
import os
import datetime

import gym
import numpy as np
import torch
import matplotlib.pyplot as plt


class Environment:
    def __init__(self, Double, Dueling, PER):
        self.env = gym.make(ENV)  # set up the task
        num_states = self.env.observation_space.shape[
            0]  # number of state variables of the task (4)
        num_actions = self.env.action_space.n  # number of possible actions (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER
        self.agent = Agent(num_states, num_actions, Double, Dueling,
                           PER)  # create the object that acts as the agent

        self.NumEpisode = []
        self.AvgSteps = []
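        # NumEpisode / AvgSteps accumulate the data for the scatter plot that
        # is drawn once the task is solved (see the end of run()).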

    def run(self):
        '''Run the training loop.'''
        episode_10_list = np.zeros(
            10)  # steps survived in the last 10 episodes (used for the running average)
        complete_episodes = 0  # number of consecutive episodes that lasted 195 steps or more
        episode_final = False  # whether this is the final episode
        frames = []  # frames of the final episode, kept for building an animation

        for episode in range(NUM_EPISODES):  # loop over episodes
            observation = self.env.reset()  # reset the environment

            state = observation  # use the observation directly as state s
            state = torch.from_numpy(state).type(
                torch.FloatTensor)  # convert the NumPy array to a PyTorch tensor
            state = torch.unsqueeze(state, 0)  # reshape size 4 into size 1x4

            for step in range(MAX_STEPS):  # loop over the steps of one episode
                #if episode_final is True: # on the final episode, store each frame in frames
                #    frames.append(self.env.render(mode='rgb_array'))

                action = self.agent.get_action(state, episode)  # choose the next action

                # Execute action a_t to obtain the next state s_{t+1} and the done flag.
                # .item() extracts the action value from the tensor.
                observation_next, _, done, _ = self.env.step(
                    action.item())  # reward and info are not used, so they are ignored

                # Assign the reward, decide whether the episode ends, and set state_next
                if done:  # done is True after 200 steps or once the pole tilts past the limit
                    state_next = None  # there is no next state, so use None

                    # record the number of steps survived in the last 10 episodes
                    episode_10_list = np.hstack(
                        (episode_10_list[1:], step + 1))

                    if step < 195:
                        #if step < 295:
                        reward = torch.FloatTensor(
                            [-1.0])  # penalty of -1 if the pole fell over mid-episode
                        complete_episodes = 0  # reset the streak of successful episodes

                    else:
                        reward = torch.FloatTensor(
                            [1.0])  # reward of 1 if the pole was still up at the end
                        complete_episodes = complete_episodes + 1  # extend the streak of successful episodes

                    # store the data used for plotting later
                    self.NumEpisode.append(episode)
                    self.AvgSteps.append(episode_10_list[-1])

                else:
                    reward = torch.FloatTensor([0.0])  # otherwise the reward is 0
                    state_next = observation_next  # use the observation directly as the state
                    state_next = torch.from_numpy(state_next).type(
                        torch.FloatTensor)  # convert the NumPy array to a PyTorch tensor
                    state_next = torch.unsqueeze(state_next,
                                                 0)  # reshape size 4 into size 1x4

                # store the experience in replay memory
                self.agent.memorize(state, action, state_next, reward)

                # store the TD error in the TD-error memory
                # (added for Prioritized Experience Replay)
                if self.PER == True:
                    self.agent.memorize_td_error(0)  # store 0 here instead of the exact value

                # update the Q-function with experience replay
                if self.PER == True:
                    self.agent.update_q_function(episode)
                else:
                    self.agent.update_q_function()

                # update the observation
                state = state_next

                # end-of-episode handling
                if done:
                    print('DQN with Double : %r, Dueling : %r, PER : %r' %
                          (self.Double, self.Dueling, self.PER))
                    print(
                        '%d Episode: Finished after %d steps : average steps over the last 10 episodes = %.1lf'
                        % (episode, step + 1, episode_10_list.mean()))

                    # PER - update the TD errors stored in the TD-error memory
                    if self.PER == True:
                        self.agent.update_td_error_memory()

                    # DDQN: sync the target network every other episode
                    if (episode % 2 == 0):
                        self.agent.update_target_q_function()
                    break

                if episode_final is True:
                    # build and save the animation
                    #display_frames_as_gif(frames)

                    break

                # the task counts as solved after enough consecutive 195+ step
                # episodes (the check below uses a threshold of 5)
                if complete_episodes >= 5:
                    print(
                        '---- DQN with Double : %r, Dueling : %r, PER : %r ----'
                        % (self.Double, self.Dueling, self.PER))
                    print('10 consecutive successful episodes')
                    print(
                        '------------------------------------------------------'
                    )
                    # plot the results
                    filename = "DQN_Double_%r_Dueling_%r_PER_%r_" % (
                        self.Double, self.Dueling,
                        self.PER) + datetime.datetime.now().strftime(
                            '%Y-%m-%d %H %M') + '.png'
                    directory = './SaveResult'
                    savepath = os.path.join(directory, filename)
                    plt.figure('%d%d%d' %
                               (self.Double, self.Dueling, self.PER))
                    plt.scatter(self.NumEpisode, self.AvgSteps)
                    plt.xlabel('num of episode')
                    plt.ylabel('average steps')
                    plt.title('DQN with Double : %r, Dueling : %r, PER : %r' %
                              (self.Double, self.Dueling, self.PER))
                    plt.grid()
                    plt.savefig(savepath)
                    plt.show()
                    episode_final = True  # generate the animation on the next episode
예제 #46
0
import os
import json
from time import sleep

import numpy as np
import tensorflow as tf

# Environment, Agent, Qnetwork, experience_buffer, updateTargetGraph and
# updateTarget are assumed to come from the project's own modules.


class PolicyLearner:
    def __init__(self,
                 load_model=True,
                 learning_rate=0.005,
                 min_trading_unit=0,
                 max_trading_unit=10,
                 delayed_reward_threshold=.01,
                 training=True):

        self.environment = Environment()
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)

        self.batch_size = 2
        self.update_freq = 4
        self.y = .99
        self.discount_factor = .8  # 0.8**30 ≈ 0.0012
        self.startE = 1
        self.endE = 0.1
        self.anneling_steps = 10000.
        self.num_episodes = 10000
        self.pre_train_steps = 200
        self.max_epLength = 20
        self.replay_memory = 10
        self.training_step = 5
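        # epsilon-greedy exploration is annealed linearly from startE down to
        # endE over anneling_steps steps (see stepDrop in train())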

        self.load_model = load_model
        self.path = './dqn'

        # create the directory where the model will be saved
        if not os.path.exists(self.path):
            os.makedirs(self.path)

        # self.h_size = 512
        self.tau = 0.001

        tf.reset_default_graph()

        self.network_type = [20, 25]  #, 6, 7]

        self.buffer_size = 0
        for image_type in self.network_type:
            image_size = 1
            for shape in self.environment.RANGE_SHAPE[image_type]:
                image_size *= shape
            self.buffer_size += image_size

        self.buffer_size = ((15 * (1024**3)) //
                            (self.buffer_size * 2 *
                             self.max_epLength)) // 10 * 10  # episodes that fit in ~15 GiB, rounded down to a multiple of 10
        print(self.buffer_size)
        self.mainQN = [
            Qnetwork(learning_rate=learning_rate,
                     model_type=type,
                     name='main_' + str(type)) for type in self.network_type
        ]
        if training:
            self.targetQN = [
                Qnetwork(learning_rate=learning_rate,
                         model_type=type,
                         name='target_' + str(type))
                for type in self.network_type
            ]
        '''
        self.mainQN = [Qnetwork(learning_rate=learning_rate, model_type=5), Qnetwork(learning_rate=learning_rate, model_type=20),
                       Qnetwork(learning_rate=learning_rate, model_type=60)]
        self.targetQN = [Qnetwork(learning_rate=learning_rate, model_type=5), Qnetwork(learning_rate=learning_rate, model_type=20),
                         Qnetwork(learning_rate=learning_rate, model_type=60)]
        '''

    def train(self):
        init = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=1, reshape=True)
        trainables = tf.trainable_variables()

        targetOps = updateTargetGraph(trainables, self.tau)
        rList = []
        #portfolio_list=[]
        total_steps = 0
        myBuffer = experience_buffer(self.buffer_size)
        episode_buffer = experience_buffer()
        e = self.startE

        stepDrop = (self.startE - self.endE) / self.anneling_steps
        with tf.Session() as sess:
            # initialize the variables
            sess.run(init)
            if self.load_model == True:
                print('Loading Model...')
                # restore the saved model
                ckpt = tf.train.get_checkpoint_state(self.path)
                saver.restore(sess, ckpt.model_checkpoint_path)
                e = self.endE

            # set the target network equal to the main network
            updateTarget(targetOps, sess)
            # start the episodes
            for ii in range(self.num_episodes):
                rAll = 0
                d = False
                j = 0
                episode_buffer.buffer = []
                episode_reward_buffer = []
                self.environment.reset()
                self.agent.reset()
                rnn_state = np.array(
                    [mainQN.state_init for mainQN in self.mainQN])
                #print('initializing episode %d :' % ii, self.environment.idx, self.environment.KOSPI_idx, 'total num :', total_steps, 'ticker', self.environment.chart_code)
                s = [
                    self.environment.get_image(days)
                    for days in self.network_type
                ]
                s_potfol = np.array(self.agent.get_states())

                episode_step = 1

                while j < self.max_epLength and not d:

                    j += 1

                    # choose an action from the inputs (Bayesian dropout + Boltzmann exploration)
                    all_Q_d = np.zeros([self.agent.NUM_ACTIONS])
                    before_rnn_state = rnn_state[:]
                    for i, mainQN in enumerate(self.mainQN):
                        Q_d, rnn_state[i] = sess.run(
                            [mainQN.Q_dist, mainQN.state_out],
                            feed_dict={
                                mainQN.inImage: [s[i]],
                                mainQN.portfolio_state: [s_potfol],
                                mainQN.state_in[0]: rnn_state[i][0],
                                mainQN.state_in[1]: rnn_state[i][1],
                                mainQN.temp: e,
                                mainQN.keep_per: (1 - e) + 0.1,
                                mainQN.phase: True
                            })
                        all_Q_d += Q_d[0]
                    # sum the probabilities from all networks, then normalize
                    #print(np.sum(all_Q_d))
                    all_Q_d /= len(self.network_type)
                    all_Q_d /= np.sum(all_Q_d)
                    #print(np.sum(all_Q_d))
                    a = np.random.choice(all_Q_d, p=all_Q_d)
                    action = np.argmax(all_Q_d == a)
                    # pass the chosen action to the agent's policy
                    delayed_reward = self.agent.act(action=action,
                                                    confidence=all_Q_d[action])
                    d = self.environment.step()
                    if e > self.endE and total_steps > self.pre_train_steps:
                        e -= stepDrop
                    '''
                    immediate_reward, delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action])

                    if e > self.endE and total_steps > self.pre_train_steps:
                        e -= stepDrop


                    #advance to the next index
                    d = self.environment.step()
                    if (delayed_reward == 0 and episode_step % 5 == 0) or d:
                        delayed_reward = immediate_reward
                        self.agent.base_portfolio_value = self.agent.portfolio_value
                  '''
                    # fetch the next image and portfolio state
                    #print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'ticker', self.environment.chart_code)
                    s1 = [
                        self.environment.get_image(days)
                        for days in self.network_type
                    ]
                    s1_potfol = np.array(self.agent.get_states())

                    episode_reward_buffer.append(delayed_reward)
                    # store the transition in the buffer
                    # original buffer order : state, action, reward, next state, done
                    # revised buffer order  : current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state
                    # final buffer order    : current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state, reward (discounted later)
                    #episode_buffer.add([s, action, delayed_reward, s1, s1_potfol, d, before_rnn_state, rnn_state, s_potfol  ]  )
                    episode_buffer.add([
                        s, action, s1, s1_potfol, d, before_rnn_state,
                        rnn_state, s_potfol
                    ])
                    if total_steps > self.pre_train_steps and total_steps % self.training_step == 0:
                        try:
                            # sample data from the buffer
                            # (in training mode, the policy network is updated when a delayed reward exists)

                            # original buffer order : state, action, reward, next state, done
                            # revised buffer order  : current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state
                            # batch size for training
                            trainBatch, size = myBuffer.sample(
                                self.replay_memory, rList)  #(self.batch_size)
                            #print('sampled training batch : ', trainBatch.shape)
                            # rewards must be multiplied by the discount factor so that they affect earlier actions

                            for i in range(len(self.network_type)):
                                # Double-DQN update of the target Q-value:
                                # the main network chooses the action.
                                # Bayesian dropout / Boltzmann exploration are not used during training.

                                # for LSTM training, replay_memory consecutive steps are taken from a random episode starting at a random date
                                # final buffer order : current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state, reward (discounted)

                                feed_dict = {
                                    self.mainQN[i].inImage:
                                    [datas[i] for datas in trainBatch[:, 2]],
                                    self.mainQN[i].portfolio_state:
                                    [data for data in trainBatch[:, 3]],
                                    self.mainQN[i].state_in[0]:
                                    trainBatch[0, 6][i][0],
                                    self.mainQN[i].state_in[1]:
                                    trainBatch[0, 6][i][1],
                                    self.mainQN[i].keep_per:
                                    1.0,
                                    self.mainQN[i].phase:
                                    True
                                }
                                Q1 = sess.run(self.mainQN[i].predict,
                                              feed_dict=feed_dict)
                                del feed_dict
                                feed_dict_2 = {
                                    self.targetQN[i].inImage:
                                    [datas[i] for datas in trainBatch[:, 2]],
                                    self.targetQN[i].portfolio_state:
                                    [data for data in trainBatch[:, 3]],
                                    self.targetQN[i].state_in[0]:
                                    trainBatch[0, 6][i][0],
                                    self.targetQN[i].state_in[1]:
                                    trainBatch[0, 6][i][1],
                                    self.targetQN[i].keep_per:
                                    1.0,
                                    self.targetQN[i].phase:
                                    True
                                }
                                Q2 = sess.run(
                                    self.targetQN[i].Qout,  # TODO: feed_dict may need revising
                                    feed_dict=feed_dict_2)
                                del feed_dict_2
                                '''
                                Q1 = sess.run(self.mainQN[i].predict,
                                              feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 3])})
                            
                                # get the Q values from the target network
                                Q2 = sess.run(self.targetQN[i].Qout, # feed_dict needs revising
                                              feed_dict={self.targetQN[i].inImage: np.vstack(trainBatch[:, 3])})
                            '''
                                # build a fake label from the done flag
                                end_multiplier = -(trainBatch[:, 4] - 1)
                                # take, from the target network's Q values, the Q value of the action chosen by the main network (this is the double-Q part)
                                doubleQ = Q2[range(size), Q1]
                                # add the double-Q value to the reward; y is the discount factor
                                # targetQ = immediate reward + best value of the next state (doubleQ)
                                targetQ = trainBatch[:, 8] + (
                                    self.y * doubleQ * end_multiplier)
                                # update the network toward our target values:
                                # the loss is the difference between targetQ and the Q value of the taken action
                                # original buffer order : state, action, reward, next state, done
                                # revised buffer order  : current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state

                                feed_dict = {
                                    self.mainQN[i].inImage:
                                    [datas[i] for datas in trainBatch[:, 0]],
                                    self.mainQN[i].portfolio_state:
                                    [data for data in trainBatch[:, 7]],
                                    self.mainQN[i].targetQ:
                                    targetQ,
                                    self.mainQN[i].actions:
                                    trainBatch[:, 1],
                                    self.mainQN[i].keep_per:
                                    1.0,
                                    self.mainQN[i].state_in[0]:
                                    trainBatch[0, 5][i][0],
                                    self.mainQN[i].state_in[1]:
                                    trainBatch[0, 5][i][1],
                                    self.mainQN[i].phase:
                                    True
                                }
                                _ = sess.run(self.mainQN[i].updateModel, \
                                             feed_dict=feed_dict)
                                del feed_dict
                                '''
                                _ = sess.run(self.mainQN[i].updateModel, \
                                             feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 0]),
                                                        self.mainQN[i].targetQ: targetQ,
                                                        self.mainQN[i].actions: trainBatch[:, 1]})
                            '''
                            updateTarget(targetOps, sess)
                        except IndexError:
                            print(trainBatch)

                    rAll += delayed_reward
                    #rAll = delayed_reward
                    # move to the next state
                    del s
                    s = s1
                    del s_potfol
                    s_potfol = s1_potfol
                    total_steps += 1
                    episode_step += 1

                #portfolio_list.append(self.agent.portfolio_value)
                #append discounted returns to the episode buffer
                accumulate = 0
                episode_reward_buffer.reverse()
                #print('%s episode_reward_len : ' % ii, len(episode_reward_buffer), 'episode_buffer_len :', len(episode_buffer.buffer))
                for i, reward in enumerate(episode_reward_buffer):
                    accumulate = self.discount_factor * accumulate + reward
                    idx = -(i + 1)
                    episode_buffer.buffer[idx] += [accumulate]
                    #print(idx, len(episode_buffer.buffer[idx]))

                myBuffer.add(episode_buffer.buffer)
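                # keep rList as a bounded window of recent episode rewards
                # (it is also passed to myBuffer.sample above)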
                if len(rList) + 1 >= self.buffer_size:
                    # self.buffer[0:1] = []
                    del rList[0]
                rList.append(rAll)
                self.environment.chartcode_value[
                    self.environment.
                    chart_code] += 1 if self.agent.portfolio_value > self.agent.initial_balance else -1
                print("%d %s %d %d %d %d" %
                      (ii, self.environment.chart_code, rAll,
                       self.agent.portfolio_value,
                       self.agent.minimum_portfolio_value,
                       self.agent.maximum_portfolio_value))
                #print("%d %4f %d %4f %4f %d %d"% (total_steps, np.mean(rList[-10:]), np.mean(portfolio_list), np.max(rList[-10:]),np.min(rList[-10:]),np.max(portfolio_list),np.min(portfolio_list)))#e)
                #print(sys.getsizeof(myBuffer.buffer), sys.getsizeof(episode_buffer.buffer))
                #portfolio_list= []
                if total_steps > self.pre_train_steps and ii % 50 == 0:
                    try:
                        saver.save(sess,
                                   self.path + '/model-' + str(ii) + '.cptk')
                        with open('./value_chart.txt', 'w') as f:
                            data = json.dumps(self.environment.chartcode_value)
                            f.write(data)
                        del data
                        #print("Saved Model")
                    except:
                        pass
                sleep(2)
            # training finished: report the average reward per episode
            saver.save(sess, self.path + '/model-' + str(ii) + '.cptk')
            print("average reward per episode : " + str(sum(rList) / self.num_episodes))

    def test(self):
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        trainables = tf.trainable_variables()

        targetOps = updateTargetGraph(trainables, self.tau)
        rList = []
        total_steps = 0
        myBuffer = experience_buffer()
        episode_buffer = experience_buffer()
        e = self.startE

        stepDrop = (self.startE - self.endE) / self.anneling_steps
        with tf.Session() as sess:
            # initialize the variables
            sess.run(init)
            if self.load_model == True:
                print('Loading Model...')
                # restore the saved model (after init, so the weights are not overwritten)
                ckpt = tf.train.get_checkpoint_state(self.path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            # set the target network equal to the main network
            updateTarget(targetOps, sess)
            # start the episodes
            for ii in range(self.num_episodes):
                rAll = 0
                d = False
                j = 0
                episode_buffer.buffer = []
                self.environment.reset()
                self.agent.reset()
                rnn_state = np.array(
                    [mainQN.state_init for mainQN in self.mainQN])
                print('initializing episode %d :' % ii, self.environment.idx,
                      self.environment.KOSPI_idx, 'total num :', total_steps,
                      'ticker', self.environment.chart_code)
                s = [
                    self.environment.get_image(days)
                    for days in self.network_type
                ]
                s_potfol = np.array(self.agent.get_states())
                while j < self.max_epLength and not d:

                    j += 1

                    # choose an action from the inputs (Bayesian dropout + Boltzmann exploration)
                    all_Q_d = np.zeros([self.agent.NUM_ACTIONS])
                    before_rnn_state = rnn_state[:]
                    for i, mainQN in enumerate(self.mainQN):
                        Q_d, rnn_state[i] = sess.run(
                            [mainQN.Q_dist, mainQN.state_out],
                            feed_dict={
                                mainQN.inImage: [s[i]],
                                mainQN.portfolio_state: [s_potfol],
                                mainQN.state_in[0]: rnn_state[i][0],
                                mainQN.state_in[1]: rnn_state[i][1],
                                mainQN.temp: e,
                                mainQN.keep_per: (1 - e) + 0.1,
                                mainQN.phase: True
                            })
                        all_Q_d += Q_d[0]
                    # sum the probabilities from all networks, then renormalize
                    # print(np.sum(all_Q_d))
                    all_Q_d /= len(self.network_type)
                    all_Q_d[0] += 1 - np.sum(all_Q_d)
                    # print(np.sum(all_Q_d))
                    a = np.random.choice(all_Q_d, p=all_Q_d)
                    action = np.argmax(all_Q_d == a)
                    # pass the chosen action to the agent's policy
                    immediate_reward, delayed_reward = self.agent.act(
                        action=action, confidence=all_Q_d[action])

                    if e > self.endE and total_steps > self.pre_train_steps:
                        e -= stepDrop

                    if delayed_reward == 0 and total_steps % 5 == 0:
                        delayed_reward = immediate_reward
                        self.agent.base_portfolio_value = self.agent.portfolio_value
                    # advance to the next index
                    d = self.environment.step()
                    # fetch the next image and portfolio state
                    # print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'ticker', self.environment.chart_code)
                    s1 = [
                        self.environment.get_image(days)
                        for days in self.network_type
                    ]
                    s1_potfol = np.array(self.agent.get_states())
                    # store in the buffer
                    # original buffer order : state, action, reward, next state, done
                    # revised buffer order  : current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio state

                    rAll += delayed_reward
                    # move to the next state
                    del s
                    s = s1
                    del s_potfol
                    s_potfol = s1_potfol
                    total_steps += 1
                if total_steps > self.pre_train_steps and ii % 50 == 0:
                    saver.save(sess, self.path + '/model-' + str(ii) + '.cptk')
                    print("Saved Model")
                    sleep(3)
                rList.append(rAll)
                myBuffer.add(episode_buffer.buffer)
                if len(rList) % 10 == 0:
                    print(total_steps, np.mean(rList[-10:]), e)

                sleep(2)
            # saver.save(sess, self.path + '/model-' + str(i) + '.cptk')
            # report the average reward
            print("average reward per episode : " + str(sum(rList) / self.num_episodes))
예제 #47
0
import logging

import numpy as np
import pandas as pd

# DummyVecEnv, VecCheckNan, OrnsteinUhlenbeckActionNoise, DDPG, TD3, PPO2 and the
# MLP policy classes are assumed to come from stable-baselines; Agent,
# Communication and securities_trading_env come from the project's own modules.


class management:
    def _init_environment(self, datapath, window_size):

        df = pd.read_csv(datapath)
        bid_price_columns = [i for i in range(1, len(df.columns), 20)]
        print(bid_price_columns)
        ask_price_columns = [i for i in range(3, len(df.columns), 20)]
        bidPrices = df[df.columns[bid_price_columns]]
        askPrices = df[df.columns[ask_price_columns]]
        df_concat = pd.concat([bidPrices, askPrices])
        midPrices = df_concat.groupby(
            df_concat.index).mean().transpose().values[-len(self.securities):]
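        # mid-prices: element-wise mean of the bid and ask series, keeping only
        # the columns for the securities actually traded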
        print(midPrices[:, 0])

        self.env = DummyVecEnv(
            [lambda: securities_trading_env(np.array(midPrices).T)])
        self.env = VecCheckNan(self.env, raise_exception=True)

        n_actions = self.env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))
        print(n_actions)

        if (self.policy == "DDPG"):
            self.model = DDPG(ddpgMlpPolicy,
                              self.env,
                              verbose=int(self.verbose),
                              param_noise=param_noise,
                              action_noise=action_noise)
        elif (self.policy == "TD3"):
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        else:
            self.model = PPO2(MlpLnLstmPolicy,
                              self.env,
                              verbose=int(self.verbose))

        if self.load:  # load a previously saved model
            self.model = self.model.load("save/" + self.modelpath + ".h5")

        # init the RL model wrapper; queue and host are assumed to be available
        # from the enclosing scope in the original project
        self.gym_model = Agent(self.market_event_securities,
                               self.market_event_queue, self.securities, queue,
                               host, self.policy, self.strategy,
                               self.cash_balance, self.model, self.env,
                               window_size, self.inventory)

    def _init_sec_prices(self, securities):
        sec_state = dict()
        for sec in securities:
            sec_state.setdefault(sec, None)
        return sec_state

    def _init_market_dict(self, market_event_securities, market_event_queue):
        market_dict = dict()
        for sec in market_event_securities:
            sym_dict = dict()
            for e in market_event_queue:
                sym_dict[e] = None
            market_dict[sec] = sym_dict
        return market_dict

    # size of each security hold is set to be 0 initially
    def _init_inventory(self, securities):
        inventory = dict()
        for sec in securities:
            inventory[sec] = 0.0
        return inventory

    def __init__(self, market_event_securities, market_event_queue, securities,
                 queue, host, policy, strategy, cash_balance, load, train,
                 train_only, verbose, modelpath, datapath, train_steps,
                 test_steps, window_size, episodes):

        logging.basicConfig(level=logging.INFO)

        self.policy = policy
        self.strategy = strategy
        self.verbose = verbose
        self.load = load
        self.train = train
        self.modelpath = modelpath

        self.strategy = strategy  # identifier for different clients
        self.market_event_securities = market_event_securities  # strings of securities, e.g. [ZFH0:MBO,ZTH0:MBO,UBH0:MBO,ZNH0:MBO,ZBH0:MBO]
        self.market_event_queue = market_event_queue  # strings of names of prices in market_event_securities, e.g. [L1, L2, L3]
        self.securities = securities

        self.num_of_securities = len(
            self.securities)  # number of securities the bot will trade in
        self.internalID = 0  # internal id for every order the bot wants to send
        self.steps = 0  # number of trades the bot has made

        self.cash_balance = cash_balance
        self.inventory = self._init_inventory(
            self.securities)  # size of each security hold
        self.inventoryValue = 0.0
        self.PnL = self.cash_balance + self.inventoryValue
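        # PnL here is the total portfolio value: cash plus marked-to-market inventory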

        self.outputfile = "save/" + strategy + "_logs.txt"

        self._init_environment(datapath, window_size)

        if self.train:  #Train model if true
            for e in range(episodes):
                logging.info(" Episode : %s" % str(e))
                self.gym_model.train_model(train_steps, test_steps)
            self.model = self.gym_model.model
            model_save = "save/" + self.modelpath + ".h5"
            logging.info("Model saved as: " + model_save)
            with open(self.outputfile, "a") as myfile:
                myfile.write("Model saved as: %s \n" % model_save)
            self.model.save(model_save)

        if train_only:
            return

        self.market_dict = self._init_market_dict(
            self.market_event_securities,
            self.market_event_queue)  # L1-L5 levels data
        # self.market_dict["ZTH0:MBO"]["L1"] to read l1 data of ZTH0:MBO
        self.ask_trend = self._init_market_dict(self.market_event_securities,
                                                self.market_event_queue)
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] goes up, self.ask_trend["ZTH0:MBO"]["L1"] = 1
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] goes down, self.ask_trend["ZTH0:MBO"]["L1"] = -1
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] stays the same, self.ask_trend["ZTH0:MBO"]["L1"] = 0
        self.bid_trend = self._init_market_dict(self.market_event_securities,
                                                self.market_event_queue)
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] goes up, self.bid_trend["ZTH0:MBO"]["L1"] = 1
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] goes down, self.bid_trend["ZTH0:MBO"]["L1"] = -1
        # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] stays the same, self.bid_trend["ZTH0:MBO"]["L1"] = 0

        self.mid_market = self._init_sec_prices(
            securities
        )  # half of the sum of current L1 ask price and L1 bid price

        self.exIds_to_inIds = dict(
        )  # when your order is acked, the bot will receive an external id for it. map exid to inid here.
        self.inIds_to_orders_sent = dict()  # orders sent but not acked
        self.inIds_to_orders_confirmed = dict(
        )  # orders confirmed by matching agent

        self.talk = Communication(market_event_securities,
                                  market_event_queue,
                                  securities,
                                  queue,
                                  host,
                                  callback_for_levels=self.callback_for_levels,
                                  callback_for_acks=self.callback_for_acks,
                                  callback_for_trades=self.callback_for_trades)
        self.talk.kickoff()

    def _save_order_being_sent(self, order):
        self.inIds_to_orders_sent[order["orderNo"]] = order

    def cancel_order(self, order):
        self.talk._cancel_order(order)

    def send_order(self, order):
        if order["side"] == 'B' and self.PnL < order["price"] * order[
                "origQty"]:
            logging.warning("portfolio : " + str(self.PnL))
            logging.warning("Not enough portfolio to buy " +
                            str(order["origQty"]) + " " + order["symb"])
            return False
        elif order["side"] == 'S' and self.inventory[
                order["symb"]] < order["origQty"]:
            logging.warning(order["symb"] + " : " +
                            str(self.inventory[order["symb"]]))
            logging.warning("Not enough " + order["symb"] + " to sell")
            return False
        else:
            order["orderNo"] = self.internalID
            self._save_order_being_sent(order)
            logging.info("\n Order %s is sent" % str(order["orderNo"]))
            self.internalID += 1
            self.talk._send_order(order)
            return True

    def _update_with_trade(self, tradeobj, side, exId):
        # buy side = 1, sell side = -1
        self._update_inventory(tradeobj.symbol, tradeobj.tradeSize * side)
        self._update_inventory_value()
        self._update_cash(tradeobj.tradeSize, tradeobj.tradePrice * (-side))
        self._update_pnl()
        self._update_order_remain(exId, tradeobj.tradeSize)
        logging.info(" [X] Cash : %s" % str(self.cash_balance))
        logging.info(" [X] Inventory Value : %s" % str(self.inventoryValue))
        logging.info(" [X] Portfolio Value : %s" % str(self.PnL))
        with open(self.outputfile, "a") as myfile:
            myfile.write(" [X] Cash : %s\n" % str(self.cash_balance))
            myfile.write(" [X] Inventory Value : %s\n" %
                         str(self.inventoryValue))
            myfile.write(" [X] Portfolio Value : %s\n" % str(self.PnL))

    def _update_inventory(self, symbol, size):
        self.inventory[symbol] += size
        logging.debug(" [X] inventory:")
        with open(self.outputfile, "a") as myfile:
            for sec in self.securities:
                logging.info("%s : %d" % (sec, self.inventory[sec]))
                myfile.write("%s : %d\n" % (sec, self.inventory[sec]))

    def _update_inventory_value(self):
        inventoryValue = 0.0
        for sec in self.securities:
            if self.mid_market[sec] is not None:
                inventoryValue += self.inventory[sec] * self.mid_market[sec]
        self.inventoryValue = inventoryValue
        logging.debug(" [X] inventory value: %d" % self.inventoryValue)
        for sec in self.securities:
            logging.debug("%s : %d" % (sec, self.inventory[sec]))

    def _update_cash(self, size, price):
        self.cash_balance += size * price
        logging.debug(" [X] cash balance: %d" % self.cash_balance)

    def _update_pnl(self):
        self.PnL = self.cash_balance + self.inventoryValue
        logging.debug(" [X] portfolio value: %d" % self.PnL)

    def _update_order_remain(self, exId, size):
        inId = self.exIds_to_inIds[exId]
        self.inIds_to_orders_confirmed[inId]["remainingQty"] -= size
        if self.inIds_to_orders_confirmed[inId]["remainingQty"] == 0:
            self.inIds_to_orders_confirmed.pop(inId)

    # only accept trade which belongs to this bot
    def _condition_to_accept_trade(self, tradeobj):
        exId = 0
        if tradeobj.buyOrderNo in list(self.exIds_to_inIds.keys()):
            print(self.exIds_to_inIds)
            with open(self.outputfile, "a") as myfile:
                myfile.write(
                    "Order %s : Buy Order %d is filled with quantity %d of price %s\n"
                    % (str(tradeobj.buyOrderNo),
                       self.exIds_to_inIds[tradeobj.buyOrderNo],
                       tradeobj.tradeSize, tradeobj.tradePrice))
            logging.info(
                "Order %s : Buy Order %d is filled with quantity %d of price %s\n"
                % (str(tradeobj.buyOrderNo),
                   self.exIds_to_inIds[tradeobj.buyOrderNo],
                   tradeobj.tradeSize, tradeobj.tradePrice))
            return tradeobj.buyOrderNo, 1
        elif tradeobj.sellOrderNo in list(self.exIds_to_inIds.keys()):
            print(self.exIds_to_inIds)
            logging.info(
                "Order %s : Sell Order %d is filled with quantity %d of price %s\n"
                % (str(tradeobj.sellOrderNo),
                   self.exIds_to_inIds[tradeobj.sellOrderNo],
                   tradeobj.tradeSize, tradeobj.tradePrice))
            with open(self.outputfile, "a") as myfile:
                myfile.write(
                    "Order %s : Sell Order %d is filled with quantity %d of price %s\n"
                    % (str(tradeobj.sellOrderNo),
                       self.exIds_to_inIds[tradeobj.sellOrderNo],
                       tradeobj.tradeSize, tradeobj.tradePrice))
            return tradeobj.sellOrderNo, -1
        else:
            return exId, 0

    def callback_for_trades(self, tradeobj):
        exId, side = self._condition_to_accept_trade(tradeobj)
        if side == -1 or side == 1:
            # update inventory and PnL, and decrease the remaining qty of the order;
            # if the remaining qty reaches 0, remove it from orders_confirmed
            self._update_with_trade(tradeobj, side, exId)
            self.steps = self.steps + 1

            self.gym_model.model_reaction_to_trade(tradeobj)

    def _update_with_ack(self, aMobj):
        inId = aMobj.internalOrderNo
        exId = aMobj.orderNo
        if aMobj.action == "A" and (inId in self.inIds_to_orders_sent):
            self.inIds_to_orders_confirmed[
                inId] = self.inIds_to_orders_sent.pop(inId)
            self.exIds_to_inIds[exId] = inId
            logging.info("ExId: %s -> InId: %s" % (exId, inId))
        elif aMobj.action == "D" and (inId in self.inIds_to_orders_confirmed):
            self.inIds_to_orders_sent[
                inId] = self.inIds_to_orders_confirmed.pop(inId)
            self.exIds_to_inIds[exId] = inId
            logging.info("ExId: %s -> InId: %s" % (exId, inId))

    # Record orders that were not successfully sent or that were canceled, in case
    # they need to be sent again, and map external ids to internal ids.
    def callback_for_acks(self, aMobj):
        if (aMobj.strategy == self.strategy):
            self._update_with_ack(aMobj)

            self.gym_model.model_reaction_to_ack(aMobj)

    def _update_trend(self, trend, symbol, lv, oldprice, newprice):
        if newprice > oldprice:
            trend[symbol][lv] = 1
        elif newprice < oldprice:
            trend[symbol][lv] = -1
        else:
            trend[symbol][lv] = 0

    def _update_market_dict(self, tob):
        sym = tob["symb"]
        for lv in self.market_event_queue:
            if tob[lv +
                   "AskPrice"] is not None and tob[lv +
                                                   "BidPrice"] is not None:
                if self.market_dict[sym][lv] is not None:
                    self._update_trend(
                        self.bid_trend,
                        sym,
                        lv,
                        oldprice=self.market_dict[sym][lv][lv + "BidPrice"],
                        newprice=tob[lv + "BidPrice"])
                    self._update_trend(
                        self.ask_trend,
                        sym,
                        lv,
                        oldprice=self.market_dict[sym][lv][lv + "AskPrice"],
                        newprice=tob[lv + "AskPrice"])

                self.market_dict[sym][lv] = {
                    lv + "AskPrice": tob[lv + "AskPrice"],
                    lv + "BidPrice": tob[lv + "BidPrice"],
                    lv + "AskSize": tob[lv + "AskSize"],
                    lv + "BidSize": tob[lv + "BidSize"]
                }

        self.mid_market[sym] = 0.5 * (
            self.market_dict[sym]["L1"]["L1AskPrice"] +
            self.market_dict[sym]["L1"]["L1BidPrice"])
        #if (sym == "ZBH0:MBO"):
        #    print("\n"+sym + ": " +str( self.mid_market[sym]))
        #print(self.mid_market)
        # self._update_inventory_value()
        # self._update_pnl()

    # should be called when new level data arrives
    def callback_for_levels(self, tob):
        self._update_market_dict(tob)
        if tob["symb"] in self.securities:
            self._update_inventory_value()
            self._update_pnl()
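            # build the observation (one mid-price per security) and let the
            # model emit new orders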
            observation = np.array([v for v in self.mid_market.values()])
            orders = self.gym_model.model_reaction_to_level(
                observation, self.inventory)
            for order in orders:
                self.send_order(order)
예제 #48
0
 def __init__(self,
              rl_method='rl',
              stock_code=None,
              chart_data=None,
              training_data=None,
              min_trading_unit=1,
              max_trading_unit=2,
              delayed_reward_threshold=.05,
              net='dqn',
              num_steps=5,
              lr=0.001,
              value_network=None,
              policy_network=None,
              output_path='',
              reuse_models=True):
     # validate the arguments
     assert min_trading_unit > 0
     assert max_trading_unit > 0
     assert max_trading_unit >= min_trading_unit
     assert num_steps > 0
     assert lr > 0
     # reinforcement learning method
     self.rl_method = rl_method
     # environment
     self.stock_code = stock_code
     self.chart_data = chart_data
     self.environment = Environment(chart_data)
     # agent
     self.agent = Agent(self.environment,
                        min_trading_unit=min_trading_unit,
                        max_trading_unit=max_trading_unit,
                        delayed_reward_threshold=delayed_reward_threshold)
     # training data
     self.training_data = training_data
     self.sample = None
     self.training_data_idx = -1
     # feature vector size = training-data vector size + agent state size
     self.num_features = self.agent.STATE_DIM
     if self.training_data is not None:
         self.num_features += self.training_data.shape[1]
     # neural network settings
     self.net = net
     self.num_steps = num_steps
     self.lr = lr
     self.value_network = value_network
     self.policy_network = policy_network
     self.reuse_models = reuse_models
     # visualization module
     self.visualizer = Visualizer()
     # memory buffers
     self.memory_sample = []
     self.memory_action = []
     self.memory_reward = []
     self.memory_value = []
     self.memory_policy = []
     self.memory_pv = []
     self.memory_num_stocks = []
     self.memory_exp_idx = []
     self.memory_learning_idx = []
     # per-epoch bookkeeping
     self.loss = 0.
     self.itr_cnt = 0
     self.exploration_cnt = 0
     self.batch_size = 0
     self.learning_cnt = 0
     # output path for logs and other artifacts
     self.output_path = output_path
예제 #49
0
import time

import gym
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

# Policy and Agent are assumed to come from the project's agent/policy modules.


def train(env_name, print_things=True, train_run_id=0, train_episodes=5000):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Arrays to keep track of rewards
    reward_history, timestep_history = [], []
    average_reward_history = []

    start = time.time()

    # Run actual training
    for episode_number in range(train_episodes):
        reward_sum, timesteps = 0, 0
        done = False
        # Reset the environment and observe the initial state
        observation = env.reset()

        # Loop until the episode is over
        while not done:
            # Get action from the agent
            action, action_probabilities = agent.get_action(observation)
            previous_observation = observation

            # Perform the action on the environment, get new state and reward
            observation, reward, done, info = env.step(action.detach().numpy())

            # Store action's outcome (so that the agent can improve its policy)
            agent.store_outcome(previous_observation, action_probabilities,
                                action, reward)

            # Store total episode reward
            reward_sum += reward
            timesteps += 1

        if print_things:
            print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".
                  format(episode_number, reward_sum, timesteps))

        # Bookkeeping (mainly for generating plots)
        reward_history.append(reward_sum)
        timestep_history.append(timesteps)
        if episode_number > 100:
            avg = np.mean(reward_history[-100:])
        else:
            avg = np.mean(reward_history)
        average_reward_history.append(avg)

        # Let the agent do its magic (update the policy)
        agent.episode_finished(episode_number)

    # Training is finished - plot rewards
    if print_things:
        plt.plot(reward_history)
        plt.plot(average_reward_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history")
        plt.show()
        print("Training finished.")
        end = time.time()
        print("Running time: {:.04f}".format((end - start)))
    data = pd.DataFrame({
        "episode": np.arange(len(reward_history)),
        "train_run_id": [train_run_id] * len(reward_history),
        # TODO: Change algorithm name for plots, if you want
        "algorithm": ["PG"] * len(reward_history),
        "reward": reward_history
    })
    torch.save(agent.policy.state_dict(),
               "model_%s_%d.mdl" % (env_name, train_run_id))
    return data
예제 #50
0
from agent import Agent
from problem import Problem

problem = Problem()
#initial_state = [1,2,3,8,4,0,7,6,5] # really easy problem (1 step)
#initial_state = [1,2,3,8,6,0,7,5,4] # easy problem (3 steps)
#initial_state = [1,2,3,8,4,7,0,6,5] # mid problem (12 steps)
#initial_state = [1,3,2,4,0,7,8,6,5]
initial_state = [5, 2, 8, 4, 1, 7, 0, 3,
                 6]  # hard problem (22 steps); goal state is [1,2,3,4,5,6,7,8,0]

agent = Agent(initial_state)
print('Current state:')
agent.print_state(initial_state)
agent.do_action(problem)
print('Solution:')
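# Keep applying actions until the agent's solve stack is empty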
while agent.solve_stack:
    agent.do_action(problem)
print('End')
예제 #51
0
def train_and_test_agent(environment_name='CarRacing-v0',
                         num_episodes=100,
                         max_test_length=5,
                         target_update_freq=1,
                         initial_test_freq=2,
                         replay_buffer_size=5e4,
                         training_batch_size=32,
                         learning_rate=0.001,
                         rewards_threshold=900,
                         action_step_length=3):

    # Set up environment and agent
    env = gym.make(environment_name)
    agent = Agent(env,
                  replay_buffer_size=replay_buffer_size,
                  learning_rate=learning_rate,
                  training_batch_size=training_batch_size,
                  num_actions=(5, 4, 4))

    # Set up logging
    meta_data = ({
        'env': environment_name,
        'target_update_freq': target_update_freq,
        'replay_buffer_size': replay_buffer_size,
        'training_batch_size': training_batch_size,
        'learning_rate': learning_rate
    })
    vars_to_track = ('episode', 'testing', 'epsilon', 'step_count', 'rewards')
    logger = GymLogger(meta_data, vars_to_track)

    # Start by filling replay buffer using random actions
    while len(agent.memory.replay_buffer) < 100:
        env.reset()
        state = None
        done = False
        while not done:
            action_num = np.random.choice(agent.num_actions)
            action = agent.action_space[action_num, :]
            next_state, reward, done, took_all_steps = take_steps(
                env, action, action_step_length)
            if took_all_steps and state is not None:
                if state.shape[2] == 9 and next_state.shape[2] == 9:  # Fix this
                    agent.memory.update(
                        (state, action_num, reward, next_state, done))
            state = next_state

    # Use testing flag to determine action selection below
    testing = False
    test_rewards = deque()
    best_test_rewards = 0

    for episode in range(num_episodes):

        # Throw away first few frames when camera is zooming in
        env.reset()
        for _ in range(10):
            state, _, _, _ = take_steps(env, (0, 1, 0), action_step_length)

        done = False
        episode_reward = 0
        step_count = 0
        while not done:
            if not testing:
                action_num, action = agent.determine_action(state)
            else:
                action_num, action = agent.act(state)
            next_state, reward, done, took_all_steps = take_steps(
                env, action, action_step_length)
            episode_reward += reward
            if took_all_steps and state is not None:
                if state.shape[2] == 9 and next_state.shape[2] == 9:  # Fix this
                    agent.memory.update(
                        (state, action_num, reward, next_state, done))
                    agent.learner.train(
                        agent.memory.random_sample(
                            num_samples=training_batch_size))
            state = next_state
            step_count += 1
            if done:
                logger.update((episode, testing, agent.epsilon, step_count,
                               episode_reward))

        if testing:

            test_rewards.append(episode_reward)
            mean_test_rewards = np.mean(test_rewards)
            print('episode: ' + str(episode))
            print('step count: ' + str(step_count))
            print('learning rate: ' + str(learning_rate))
            print('epsilon: ' + str(agent.epsilon))
            print('mean test rewards: ' + str(mean_test_rewards))
            print('test episodes: ' + str(len(test_rewards)))
            print('\n')

            if mean_test_rewards > best_test_rewards:
                print('New best test result')
                logger.save_model_weights(
                    agent.learner.target_model.get_weights(), 'best')
                best_test_rewards = mean_test_rewards

            if mean_test_rewards < rewards_threshold:
                testing = False
                test_rewards.clear()
            elif len(test_rewards) == max_test_length:
                print('Success!')
                logger.save_model_weights(
                    agent.learner.target_model.get_weights(), episode)
                break

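        # Periodically sync the target network and periodically enable testing episodes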
        if (episode >= target_update_freq) and (episode % target_update_freq
                                                == 0):
            agent.learner.update_target()

        if (episode > initial_test_freq) and (episode %
                                              initial_test_freq) == 0:
            testing = True

        agent.udpate_epsilon()

    logger.save_history()
    env.close()
예제 #52
0
# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
state = env_info.visual_observations[0]
print('States look like:', state)
print('States have shape:', state.shape)

state = process_observation(state, device)

# load the weights from file
agent = Agent(input_shape=state.shape[1:], action_size=action_size, seed=0)
agent.qnetwork_local.load_state_dict(
    torch.load('../checkpoints/dueling_checkpoint.pth'))

score = 0  # initialize the score

for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]

    state = env_info.visual_observations[0]  # get the current state
    state = process_observation(state, device)

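    # Run the trained agent in the environment for up to 2000 steps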
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action)[brain_name]
예제 #53
0
 def __init__(self):
     Agent.__init__(self)
LR_CRITIC = 1e-3  # learning rate of the critic
SEED = 0
TAU = 6e-2  # for soft update of target parameters
WEIGHT_DECAY = 0  # L2 weight decay
UPDATE_EVERY = 1  # time steps between network updates
#N_UPDATES = 1           # number of times training
ADD_NOISE = True

#eps_start = 6           # Noise level start
#eps_end = 0             # Noise level end
#eps_decay = 250         # Number of episodes to decay over from start to end

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("DEVICE:{}".format(device))
""" Setup two independent agents with shared experience memory """
agent_0 = Agent(state_size, action_size, num_agents=1, seed=SEED)
agent_1 = Agent(state_size, action_size, num_agents=1, seed=SEED)

n_episodes = 1000
scores_window = deque(maxlen=100)
scores_all = []
rolling_average = []
elapsed_time_list = []

list1 = []
list2 = []
list3 = []

for i_episode in range(0, n_episodes):
    # Start the clock
    start_time = time.time()
예제 #55
0
    def __init__(self,
                 load_model=True,
                 learning_rate=0.005,
                 min_trading_unit=0,
                 max_trading_unit=10,
                 delayed_reward_threshold=.01,
                 training=True):

        self.environment = Environment()
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)

        self.batch_size = 2
        self.update_freq = 4
        self.y = .99
        self.discount_factor = .8  # 0.8**30 ≈ 0.001
        self.startE = 1
        self.endE = 0.1
        self.anneling_steps = 10000.
        self.num_episodes = 10000
        self.pre_train_steps = 200
        self.max_epLength = 20
        self.replay_memory = 10
        self.training_step = 5

        self.load_model = load_model
        self.path = './dqn'

        # Create the directory where the model will be saved.
        if not os.path.exists(self.path):
            os.makedirs(self.path)

        # self.h_size = 512
        self.tau = 0.001

        tf.reset_default_graph()

        self.network_type = [20, 25]  #, 6, 7]

        self.buffer_size = 0
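        # Sum the flattened sizes of all configured image types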
        for image_type in self.network_type:
            image_size = 1
            for shape in self.environment.RANGE_SHAPE[image_type]:
                image_size *= shape
            self.buffer_size += image_size

        self.buffer_size = ((15 * (1024**3)) //
                            (self.buffer_size * 2 *
                             self.max_epLength)) // 10 * 10  # ~15 GiB / bytes per episode, rounded down to a multiple of 10
        print(self.buffer_size)
        self.mainQN = [
            Qnetwork(learning_rate=learning_rate,
                     model_type=type,
                     name='main_' + str(type)) for type in self.network_type
        ]
        if training:
            self.targetQN = [
                Qnetwork(learning_rate=learning_rate,
                         model_type=type,
                         name='target_' + str(type))
                for type in self.network_type
            ]
        '''
예제 #56
0
import gym
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from agent import Agent

env = gym.make('CartPole-v1')

agent = Agent(env.observation_space.shape[0], env.action_space.n, seed=0, batch_size=32, q_size=25)

env.seed(0)

def DQN(num_episodes = 7500, max_iteration = 1000, init_epsilon = 1.0, min_epsilon = 0.05, decay = 0.999):
    '''
    Run the DQN training loop with epsilon-greedy exploration.

    :param num_episodes: number of training episodes to run
    :param max_iteration: maximum number of steps per episode
    :param init_epsilon: initial exploration rate
    :param min_epsilon: lower bound on the exploration rate
    :param decay: per-episode multiplicative decay factor for epsilon
    :return:
    '''


    total_reward = []
    total_reward_window = deque(maxlen=100)
    epsilon = init_epsilon
예제 #57
0
def agent():
    from agent import Agent

    agent = Agent(action_space=[0, 1, 2, 3, 4])
    return agent
예제 #58
0
class ReinforcementLearner:
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()

    def __init__(self,
                 rl_method='rl',
                 stock_code=None,
                 chart_data=None,
                 training_data=None,
                 min_trading_unit=1,
                 max_trading_unit=2,
                 delayed_reward_threshold=.05,
                 net='dqn',
                 num_steps=5,
                 lr=0.001,
                 value_network=None,
                 policy_network=None,
                 output_path='',
                 reuse_models=True):
        # Validate arguments
        assert min_trading_unit > 0
        assert max_trading_unit > 0
        assert max_trading_unit >= min_trading_unit
        assert num_steps > 0
        assert lr > 0
        # Reinforcement learning method
        self.rl_method = rl_method
        # Environment setup
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        # Agent setup
        self.agent = Agent(self.environment,
                           min_trading_unit=min_trading_unit,
                           max_trading_unit=max_trading_unit,
                           delayed_reward_threshold=delayed_reward_threshold)
        # Training data
        self.training_data = training_data
        self.sample = None
        self.training_data_idx = -1
        # Feature vector size = training data vector size + agent state size
        self.num_features = self.agent.STATE_DIM
        if self.training_data is not None:
            self.num_features += self.training_data.shape[1]
        # Neural network settings
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        # Visualization module
        self.visualizer = Visualizer()
        # Memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Epoch-related information
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0
        # Output path for logs, etc.
        self.output_path = output_path

    def init_value_network(self,
                           shared_network=None,
                           activation='linear',
                           loss='mse'):
        if self.net == 'lstm':
            self.value_network = LSTMNetwork(input_dim=self.num_features,
                                             output_dim=self.agent.NUM_ACTIONS,
                                             lr=self.lr,
                                             num_steps=self.num_steps,
                                             shared_network=shared_network,
                                             activation=activation,
                                             loss=loss)

        if self.reuse_models and \
            os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)

    def init_policy_network(self,
                            shared_network=None,
                            activation='sigmoid',
                            loss='binary_crossentropy'):
        if self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr,
                num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation,
                loss=loss)
        if self.reuse_models and \
            os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)

    def reset(self):
        self.sample = None
        self.training_data_idx = -1
        # Reset the environment
        self.environment.reset()
        # Reset the agent
        self.agent.reset()
        # Reset the visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        # Reset the memory buffers
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        self.memory_learning_idx = []
        # Reset epoch-related information
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        self.learning_cnt = 0

    def build_sample(self):
        self.environment.observe()
        if len(self.training_data) > self.training_data_idx + 1:
            self.training_data_idx += 1
            self.sample = self.training_data.iloc[
                self.training_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None

    @abc.abstractmethod
    def get_batch(self, batch_size, delayed_reward, discount_factor):
        pass

    def update_networks(self, batch_size, delayed_reward, discount_factor):
        # Build the batch training data
        x, y_value, y_policy = self.get_batch(batch_size, delayed_reward,
                                              discount_factor)
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # Update the value network
                loss += self.value_network.train_on_batch(x, y_value)
            if y_policy is not None:
                # Update the policy network
                loss += self.policy_network.train_on_batch(x, y_policy)
            return loss
        return None

    def fit(self, delayed_reward, discount_factor, full=False):
        batch_size = len(self.memory_reward) if full \
            else self.batch_size
        # Build the batch training data and update the networks
        if batch_size > 0:
            _loss = self.update_networks(batch_size, delayed_reward,
                                         discount_factor)
            if _loss is not None:
                self.loss += abs(_loss)
                self.learning_cnt += 1
                self.memory_learning_idx.append(self.training_data_idx)
            self.batch_size = 0

    def visualize(self, epoch_str, num_epoches, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] \
            * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) \
            + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] \
                * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                    + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] \
                * len(Agent.ACTIONS))] * (self.num_steps - 1) \
                    + self.memory_policy
        self.memory_pv = [self.agent.initial_balance] \
            * (self.num_steps - 1) + self.memory_pv
        self.visualizer.plot(
            epoch_str=epoch_str,
            num_epoches=num_epoches,
            epsilon=epsilon,
            action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            learning_idxes=self.memory_learning_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )

        self.visualizer.save(
            os.path.join(self.epoch_summary_dir,
                         'epoch_summary_{}.png'.format(epoch_str)))

    def run(self,
            num_epoches=100,
            balance=10000000,
            discount_factor=0.9,
            start_epsilon=0.5,
            learning=True):
        info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \
            "DF:{discount_factor} TU:[{min_trading_unit}," \
            "{max_trading_unit}] DRT:{delayed_reward_threshold}".format(
            code=self.stock_code, rl=self.rl_method, net=self.net,
            lr=self.lr, discount_factor=discount_factor,
            min_trading_unit=self.agent.min_trading_unit,
            max_trading_unit=self.agent.max_trading_unit,
            delayed_reward_threshold=self.agent.delayed_reward_threshold
        )
        with self.lock:
            logging.info(info)

        # Start time
        time_start = time.time()

        # Prepare visualization
        # The chart data does not change, so visualize it in advance
        self.visualizer.prepare(self.environment.chart_data, info)

        # Prepare the folder where visualization results are saved
        self.epoch_summary_dir = os.path.join(
            self.output_path, 'epoch_summary_{}'.format(self.stock_code))
        if not os.path.isdir(self.epoch_summary_dir):
            os.makedirs(self.epoch_summary_dir)
        else:
            for f in os.listdir(self.epoch_summary_dir):
                os.remove(os.path.join(self.epoch_summary_dir, f))

        # Set the agent's initial balance
        self.agent.set_balance(balance)

        # Initialize training statistics
        max_portfolio_value = 0
        epoch_win_cnt = 0

        # Training loop
        for epoch in range(num_epoches):
            time_start_epoch = time.time()

            # Queue used to build samples of num_steps steps
            q_sample = collections.deque(maxlen=self.num_steps)

            # Reset the environment, agent, networks, visualizer, and memory
            self.reset()

            # Decrease the exploration rate as training progresses
            if learning:
                epsilon = start_epsilon \
                    * (1. - float(epoch) / (num_epoches - 1))
                self.agent.reset_exploration()
            else:
                epsilon = start_epsilon

            while True:
                # Build a sample
                next_sample = self.build_sample()
                if next_sample is None:
                    break

                # Keep up to num_steps samples
                q_sample.append(next_sample)
                if len(q_sample) < self.num_steps:
                    continue

                # Value and policy network predictions
                pred_value = None
                pred_policy = None
                if self.value_network is not None:
                    pred_value = self.value_network.predict(list(q_sample))
                if self.policy_network is not None:
                    pred_policy = self.policy_network.predict(list(q_sample))

                # Decide an action based on the networks or on exploration
                action, confidence, exploration = \
                    self.agent.decide_action(
                        pred_value, pred_policy, epsilon)

                # Perform the decided action and get the immediate and delayed rewards
                immediate_reward, delayed_reward = \
                    self.agent.act(action, confidence)

                # Remember the action and its results
                self.memory_sample.append(list(q_sample))
                self.memory_action.append(action)
                self.memory_reward.append(immediate_reward)
                if self.value_network is not None:
                    self.memory_value.append(pred_value)
                if self.policy_network is not None:
                    self.memory_policy.append(pred_policy)
                self.memory_pv.append(self.agent.portfolio_value)
                self.memory_num_stocks.append(self.agent.num_stocks)
                if exploration:
                    self.memory_exp_idx.append(self.training_data_idx)

                # Update iteration information
                self.batch_size += 1
                self.itr_cnt += 1
                self.exploration_cnt += 1 if exploration else 0

                # Mini-batch training when a delayed reward occurs
                if learning and (delayed_reward != 0):
                    self.fit(delayed_reward, discount_factor)

            # Train once more at the end of the epoch
            if learning:
                self.fit(self.agent.profitloss, discount_factor, full=True)

            # Log epoch information
            num_epoches_digit = len(str(num_epoches))
            epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0')
            time_end_epoch = time.time()
            elapsed_time_epoch = time_end_epoch - time_start_epoch
            if self.learning_cnt > 0:
                self.loss /= self.learning_cnt
            logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} "
                         "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} "
                         "#Stocks:{} PV:{:,.0f} "
                         "LC:{} Loss:{:.6f} ET:{:.4f}".format(
                             self.stock_code, epoch_str, num_epoches, epsilon,
                             self.exploration_cnt, self.itr_cnt,
                             self.agent.num_buy, self.agent.num_sell,
                             self.agent.num_hold, self.agent.num_stocks,
                             self.agent.portfolio_value, self.learning_cnt,
                             self.loss, elapsed_time_epoch))

            # Visualize epoch information
            self.visualize(epoch_str, num_epoches, epsilon)

            # Update training statistics
            max_portfolio_value = max(max_portfolio_value,
                                      self.agent.portfolio_value)
            if self.agent.portfolio_value > self.agent.initial_balance:
                epoch_win_cnt += 1

        # End time
        time_end = time.time()
        elapsed_time = time_end - time_start

        # Log training results
        with self.lock:
            logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} "
                         "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format(
                             code=self.stock_code,
                             elapsed_time=elapsed_time,
                             max_pv=max_portfolio_value,
                             cnt_win=epoch_win_cnt))

        return self.memory_pv

    def save_models(self):
        if self.value_network is not None and \
                self.value_network_path is not None:
            self.value_network.save_model(self.value_network_path)
        if self.policy_network is not None and \
                self.policy_network_path is not None:
            self.policy_network.save_model(self.policy_network_path)
예제 #59
0
def one_player():
    board = Board()
    agent = Agent(board, PLAYER_X, exploration_rate=0)
    agent.q_values = pickle.load(open("model/tic-tac-toe-agent-x-epochs-5000.pickle", "rb"))
    game = Game()
    game.one_player(board, agent)
예제 #60
0
    # create a pyglet window and set glOptions
    win = window.Window(width=500, height=500, vsync=True, resizable=True)
    glEnable(GL_BLEND)
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
    # needed so that egi knows where to draw
    egi.InitWithPyglet(win)
    # prep the fps display
    fps_display = clock.ClockDisplay()
    # register key and mouse event handlers
    win.push_handlers(on_key_press)
    win.push_handlers(on_mouse_press)
    win.push_handlers(on_resize)

    # create a world for agents
    world = World(500, 500)
    # add one agent
    world.agents.append(Agent(world))
    # unpause the world ready for movement
    world.paused = False

    while not win.has_exit:
        win.dispatch_events()
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        # show nice FPS bottom right (default)
        delta = clock.tick()
        world.update(delta)
        world.render()
        fps_display.draw()
        # swap the double buffer
        win.flip()