def process(self):
    Agent.process(self)
    postponed = self._postponed_messages[:]
    self._postponed_messages = list()
    for m in postponed:
        self._process_message(m)
def main(): """ 来源和目标的默认设置,可以自己修改。 """ agent=Agent() #从新闻网站获取新闻的SimpleWebSource: url='http://news.163.com/' sourcename='网易' starttag='<!-- 头条区 -->' endtag='<div class="ns-wnews ns-recommand mb30" id="nsRecForYou"></div>' subtag='target="_blank"' titlePattern=r'<a(.*?)href="(http://.*?\.163\.com.*?)">(.*?)</a>' contentPattern=r''' <h1 id="h1title" class="ep-h1">(.*?)</h1> [\s\S]*? <div class="ep-time-soure cDGray">(.*?) 来源 [\s\S]*? <div id="endText" class="end-text"> ([\s\S]*?) 本文来源:(.*?)</span> ''' netease=NeteaseSource(url,sourcename,starttag, endtag,subtag,titlePattern,contentPattern) #增加纯文本目标和HTML目标 agent.addSource(netease) #发布新闻项目 agent.distribute()
def test_startup_and_shutdown():
    # Create an agent that throws an exception when it receives
    # a payload command packet.
    a = Agent()
    a.bind_udp_sockets()
    a.service_handler["Payload Command"] = Agent.raise_exception

    # Run agent.
    t = threading.Thread(target=Agent.run, args=(a,))
    t.daemon = True
    t.start()

    # Send an ACK packet
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    p.ack = 1
    Send.send_to_self(p)

    # Wait for and then assert that thread has *not* exited.
    t.join(0.01)
    assert t.is_alive()

    # Send a payload command packet -- SHUTDOWN
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    Send.send_to_self(p)

    # Wait for and then assert that thread has exited.
    t.join(0.01)
    assert not t.is_alive()
def solve(system, initV=None, gamma=0.9):
    numNodes = system.network.numNodes
    numTrt = Agent.numTrt(system)
    numValidTrt = Agent.numValidTrt(numNodes, numTrt)

    if initV is None:
        initV = np.zeros((1 << numNodes,))

    it = 0
    maxIt = 1000
    tol = 1e-8
    cont = True
    v0 = initV
    while cont:
        v1 = ValueIteration.operT(system, gamma, v0)
        it += 1

        if np.linalg.norm(v1 - v0, 2) < tol or it == maxIt:
            cont = False

        v0 = v1

    if it == maxIt:
        raise ValueError("ValueIteration hit iteration limit")

    return v0
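For reference, solve is a plain fixed-point iteration of the Bellman optimality operator T implemented by ValueIteration.operT further down, with the stopping rule used in the loop:

$$v_{k+1} = T v_k, \qquad \text{stop when } \lVert v_{k+1} - v_k \rVert_2 < 10^{-8} \ \text{ or } k = 1000.$$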
def test_timeout():
    # Create an agent that throws an exception when it receives
    # a payload command packet.
    a = Agent()
    a.bind_udp_sockets()
    a.service_handler["Payload Command"] = Agent.raise_exception

    # Set a timeout that is << delay.
    Agent.TIMEOUT = 0.005

    # Run agent.
    t = threading.Thread(target=Agent.run, args=(a,))
    t.daemon = True
    t.start()

    # Delay
    time.sleep(0.02)

    # Send a payload command packet -- SHUTDOWN
    p = Packet()
    p.service = Supernova.service_id("Payload Command")
    p.dest_node = Supernova.get_my_id()
    Send.send_to_self(p)

    # Wait for and then assert that thread has exited.
    t.join(0.01)
    assert not t.is_alive()
def setUp(self):
    exchange1 = Exchange()
    exchange2 = Exchange()
    self.location1 = Location(exchange1)
    self.location2 = Location(exchange2)
    self.agent1 = Agent(location=self.location1)
    self.agent2 = Agent(location=self.location2)
def run(args):
    logging.basicConfig(filename=args.LOG_FILE, level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler())

    game_handler = GameStateHandler(random_seed=123,
                                    frame_skip=args.FRAME_SKIP,
                                    use_sdl=False,
                                    image_processing=lambda x: crop_and_resize(x, args.IMAGE_HEIGHT, args.IMAGE_WIDTH))
    game_handler.loadROM(args.ROM_FILE)
    height, width = game_handler.getScreenDims()
    logging.info('Screen resolution is %dx%d' % (height, width))
    num_actions = game_handler.num_actions

    net = theano_qnetwork.DeepQNetwork(args.IMAGE_HEIGHT, args.IMAGE_WIDTH, num_actions,
                                       args.STATE_FRAMES, args.DISCOUNT_FACTOR)

    replay_memory = ReplayMemoryManager(args.IMAGE_HEIGHT, args.IMAGE_WIDTH,
                                        args.STATE_FRAMES, args.REPLAY_MEMORY_SIZE)

    monitor = Monitoring(log_train_step_every=100, smooth_episode_scores_over=50)

    agent = Agent(game_handler, net, replay_memory, None, monitor,
                  args.TRAIN_FREQ, batch_size=args.BATCH_SIZE)

    start_epsilon = args.START_EPSILON
    exploring_duration = args.EXPLORING_DURATION

    agent.populate_replay_memory(args.MIN_REPLAY_MEMORY)
    agent.play(train_steps_limit=args.LEARNING_BEYOND_EXPLORING + args.EXPLORING_DURATION,
               start_eps=start_epsilon, final_eps=args.FINAL_EPSILON,
               exploring_duration=exploring_duration)
def __init__(self, aid, booksList):
    Agent.__init__(self, aid)
    self.booksList = booksList

    comportamento = ComportamentoAgenteLivraria(self)
    self.behaviours.append(comportamento)
class Environment():

    def __init__(self):
        env = gym.make(ENV)
        self.env = wrappers.Monitor(env, '/tmp/gym/mountaincar_dqn', force=True)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.agent = Agent(self.num_states, self.num_actions)

    def run(self):
        complete_episodes = 0
        episode_final = False
        output = open('result.log', 'w')

        print(self.num_states, self.num_actions)

        for episode in range(NUM_EPISODE):
            observation = self.env.reset()
            state = torch.from_numpy(observation).type(torch.FloatTensor)
            state = torch.unsqueeze(state, 0)

            for step in range(MAX_STEPS):
                if episode_final:
                    self.env.render(mode='rgb_array')

                action = self.agent.get_action(state, episode)
                observation_next, _, done, _ = self.env.step(action.item())

                state_next = torch.from_numpy(observation_next).type(torch.FloatTensor)
                state_next = torch.unsqueeze(state_next, 0)

                reward = torch.FloatTensor([0.0])
                if done:
                    state_next = None
                    if 199 <= step:
                        reward = torch.FloatTensor([-1.0])
                        complete_episodes = 0
                    else:
                        reward = torch.FloatTensor([1.0])
                        complete_episodes = complete_episodes + 1

                self.agent.memory(state, action, state_next, reward)
                self.agent.update_q_function()

                state = state_next

                if done:
                    message = 'episode: {0}, step: {1}'.format(episode, step)
                    print(message)
                    output.write(message + '\n')
                    break

            if episode_final:
                break

            if 10 <= complete_episodes:
                print('success 10 times in sequence')
                # episode_final = True

        self.env.close()
        output.close()
def main(game_name, lr, num_agents, update_target_every, model_name, tau):
    assert 'NoFrameskip-v4' in game_name

    if 'soft' in model_name:
        update_target_every = 1

    basename = '{}:lr={}:na={}:ute={}:{}'.format(
        game_name[:-14], lr, num_agents, update_target_every, model_name)
    if 'soft' in model_name:
        basename += ':tau={}'.format(tau)

    env = Agent(num_agents, game_name, basename)
    try:
        estimator = get_estimator(model_name, env.action_n, lr, 0.99, tau=tau)
        base_path = os.path.join(train_path, basename)
        print("start training!!")
        dqn(env,
            estimator,
            base_path,
            batch_size=32,
            epsilon=0.01,
            save_model_every=1000,
            update_target_every=update_target_every,
            learning_starts=200,
            memory_size=100000,
            num_iterations=40000000)
    except KeyboardInterrupt:
        print("\nKeyboard interrupt!!")
    except Exception:
        traceback.print_exc()
    finally:
        env.close()
def __init__(self, player_id, own_dice_list):
    Agent.__init__(self, player_id, own_dice_list)
    self.num_each_fv = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
    for fv in self.own_dice_list:
        self.num_each_fv[fv] += 1
    self.pg = ProbGenerator((NUM_PLAYERS - 1) * NUM_DICE)
    self.pg.calc()
def run(N):
    """ Runs N episodes of a given length and then runs a demo with greedy policy """
    agent = Agent()
    data = read_data('./data/q.dat')
    if data is not None:
        agent.Q = data

    for i in range(N):
        bot = Bot()
        run_episode(bot, agent, None, draw=False, policy='eps_greedy')
        # if bot.center[1] > 7:
        print "robot moved on: %i steps" % bot.center[1]

    pg.init()
    pg.display.init()
    surf = pg.display.set_mode((800, 600))
    surf.fill((0, 0, 0))
    pg.display.flip()
    print "Surf1:", surf

    bot = Bot()
    bot.info()
    run_episode(bot, agent, surf, draw=True, policy='eps_greedy', episode_len=60)
    print "Robot's moves:\n", bot.path
    print "Robot walked %i m" % bot.center[1]
    print "Last state value=%.1f" % agent.get_state_value(bot.get_state())

    write_data(agent.Q, "data/q.dat")
    write_path(agent.Q_values, "data/path.csv")
def react(self, message):
    Agent.react(self, message)
    display_message(self.aid.name, 'Uma mensagem recebida')
    if 'agente_teste_participante' in self.aid.name:
        resposta = message.create_reply()
        resposta.set_content('Ola tambem agente!')
        self.send(resposta)
def __init__(self, name, fg, ms, opt):
    Agent.__init__(self, name, fg, ms, opt)
    self.f = self.fg.functions[self.name]
    self.neighbors = self.f.variables
    self.domains = {v: self.fg.variables[v].domain for v in self.neighbors}
    self.q = {v: {value: 0 for value in self.domains[v]} for v in self.neighbors}
    self.terminated_neighbors = {v: False for v in self.neighbors}
def __init__(self, aid):
    Agent.__init__(self, aid)

    message = ACLMessage(ACLMessage.REQUEST)
    message.set_protocol(ACLMessage.FIPA_REQUEST_PROTOCOL)
    message.set_content('REQUEST')
    message.add_receiver('agent_participant_1')

    comportamento_1 = RequestIniciante(self, message)
    self.addBehaviour(comportamento_1)
def __init__(self, name, fg, ms, opt):
    Agent.__init__(self, name, fg, ms, opt)
    self.v = self.fg.variables[self.name]
    self.neighbors = self.v.functions
    self.domain = self.v.domain
    self.z = {value: 0 for value in self.domain}
    self.r = {f: {value: 0 for value in self.domain} for f in self.neighbors}
    self.z_queue = []
def valueIteration(discountFactor):
    # all locations in grid
    alllocations = [(x, y) for x in range(11) for y in range(11)]

    # initialize values
    values = {}
    bestMoves = {}
    for predloc in alllocations:
        for preyloc in alllocations:
            if preyloc != predloc:
                values[(predloc, preyloc)] = 0

    agent = Agent(0, 0)
    deltas = []
    epsilon = 0.01
    delta = 1
    numIt = 0
    # perform value iteration according to the pseudo-code
    while delta > epsilon:
        delta = 0
        newValues = {}
        # loop over all states
        for predloc in alllocations:
            for preyloc in alllocations:
                if predloc == preyloc:
                    continue
                agent.setLocation(predloc)
                prey = Prey(*preyloc)
                temp = values[(predloc, preyloc)]
                # find optimal value according to current values
                bestVal = 0
                bestMove = (0, 0)
                for prob, predMove in agent.getMoveList():
                    preySum = 0
                    newPredloc = ((predloc[0] + predMove[0]) % 11,
                                  (predloc[1] + predMove[1]) % 11)
                    if newPredloc == preyloc:
                        preySum += 10.0
                    else:
                        for preyProb, newPreyloc in prey.expand(newPredloc):
                            preySum += preyProb * discountFactor * values[(newPredloc, newPreyloc)]
                    if bestVal <= preySum:
                        bestVal = preySum
                        bestMove = predMove
                newValues[(predloc, preyloc)] = bestVal
                bestMoves[(predloc, preyloc)] = bestMove
                delta = max(delta, np.abs(bestVal - temp))
        values = newValues
        deltas.append(delta)
        numIt += 1

    # greedy policy w.r.t. the optimal values computed above
    def policy(state):
        predloc, preyloc = state
        agent.setLocation(predloc)
        prey = Prey(*preyloc)
        return bestMoves[(predloc, preyloc)]

    return numIt, values, policy
def hello_world():
    get_ntp_time()
    e = threading.Event()
    ip, port, message = "10.1.1.2", 9999, 'Hello World'
    interval = 1
    counter = 20
    t1 = Agent(e, interval, counter, ip, port, message)
    t1.start()
    return message
def valueIteration():
    # the relative positions vary from -5 up to 5, in both dimensions
    alldiffs = [(x, y) for x in range(-5, 6) for y in range(-5, 6)]
    alldiffs.remove((0, 0))

    values = {}
    for x in range(-5, 6):
        for y in range(-5, 6):
            values[(x, y)] = 0
    bestMoves = {}

    agent = Agent(0, 0)
    deltas = []
    discountFactor = 0.8
    epsilon = 0.01
    delta = 1
    while delta > epsilon:
        delta = 0
        newValues = {}
        for diff in alldiffs:
            # we place the predator in the middle of the world;
            # we are allowed to do this, since the positions are encoded relatively
            predloc = (5, 5)
            preyloc = (predloc[0] + diff[0], predloc[1] + diff[1])
            curKey = rewriteStates(predloc, preyloc)
            agent.setLocation(predloc)
            prey = Prey(*preyloc)
            temp = values[curKey]
            bestVal = 0
            bestMove = (0, 0)
            for prob, predMove in agent.getMoveList():
                preySum = 0
                newPredloc = agent.locAfterMove(predMove)
                if newPredloc == preyloc:
                    preySum += 10.0
                else:
                    for preyProb, newPreyloc in prey.expand(newPredloc):
                        # using rewriteStates we use relative positions
                        preySum += preyProb * discountFactor * values[rewriteStates(newPredloc, newPreyloc)]
                if bestVal <= preySum:
                    bestVal = preySum
                    bestMove = predMove
            newValues[curKey] = bestVal
            bestMoves[curKey] = bestMove
            delta = max(delta, np.abs(bestVal - temp))
        values = newValues
        deltas.append(delta)

    def policy(state):
        predloc, preyloc = state
        agent.setLocation(predloc)
        prey = Prey(*preyloc)
        return bestMoves[rewriteStates(predloc, preyloc)]

    return policy
def _process_message(self, message):
    for activity in self._activities:
        try:
            result = activity(message)
            self._activities.remove(activity)
            return result
        except MatchError:
            pass
    Agent._process_message(self, message)
def __init__(self, aid, bookStores):
    Agent.__init__(self, aid)
    self.bookStores = bookStores
    self.bestPropose = None
    self.bestBookStore = None
    self.proposes = []
    self.messages = []
    self.sends = 0
    self.receives = 0
def _run(self):
    def sigterm_clean(signum, frame):
        try:
            os.kill(os.getpid(), signal.SIGKILL)
        except:
            pass

    signal.signal(signal.SIGTERM, sigterm_clean)
    agent = Agent()
    agent.main()
def create_agent(self, device):
    datapath_id = device.datapath_id
    device_id = device.id
    for controller_endpoint in self.controller_endpoints:
        agent = Agent(controller_endpoint, datapath_id, device_id,
                      self.grpc_client, self.enable_tls,
                      self.key_file, self.cert_file)
        agent.start()
        self.agent_map[(datapath_id, controller_endpoint)] = agent
        self.device_id_to_datapath_id_map[device_id] = datapath_id
def ctl_panic(*args):
    """Ask master to engage panic mode."""
    def success(result):
        print "Panic mode requested..."

    agent = Agent()
    d = agent.panic()
    d.addCallback(success)
    return d
def ctl_recover(*args):
    """Ask master to recover from panic mode."""
    def success(result):
        print "Recovering from panic mode... Please check logs."

    agent = Agent()
    d = agent.recover()
    d.addCallback(success)
    return d
def __init__(self, player_id):
    Agent.__init__(self, player_id)
    #self.num_each_fv = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
    #for fv in self.own_dice_list:
    #    self.num_each_fv[fv] += 1
    self.pg = ProbGenerator(NUM_PLAYERS * NUM_DICE)
    self.pg.calc()
    self.good_bid_count = 0
    self.num_bids_made = 0
    self.bad_bid_count = 0
def on_start(self):
    Agent.on_start(self)
    display_message(self.aid.name, "Hello World")
    if 'agente_teste_iniciante' in self.aid.name:
        message = ACLMessage(ACLMessage.INFORM)
        message.add_receiver('agente_teste_participante')
        message.set_content('Ola Agente!')
        self.send(message)
        display_message(self.aid.name, 'Enviando mensagem...')
def __init__(self, aid):
    Agent.__init__(self, aid)

    pedido = {'tipo': 'pedido', 'qtd': 100.0}
    message = ACLMessage(ACLMessage.CFP)
    message.set_protocol(ACLMessage.FIPA_CONTRACT_NET_PROTOCOL)
    message.set_content(dumps(pedido))
    message.add_receiver('participant_agent_1')
    message.add_receiver('participant_agent_2')

    behaviour = InitiatorProtocol(self, message)
    self.addBehaviour(behaviour)
def operT(system, gamma, v):
    numNodes = system.network.numNodes
    numTrt = Agent.numTrt(system)
    numValidTrt = Agent.numValidTrt(numNodes, numTrt)

    vForA = np.zeros((1 << numNodes, numValidTrt))
    for aInd in range(numValidTrt):
        P, R = ValueIteration.calcPAndR(system, aInd)
        vForA[:, aInd] = R + gamma * (P.dot(v))

    return np.amax(vForA, 1)
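In standard MDP notation, the backup that operT computes column by column (one column per action index, then a maximum over actions) is the Bellman optimality operator:

$$(Tv)(s) = \max_{a}\Big[\, R(s,a) + \gamma \sum_{s'} P(s' \mid s, a)\, v(s') \,\Big].$$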
def ctl_dump(*args):
    """Dump current internal memory."""
    def success(result):
        import pprint
        pprint.pprint(result)

    agent = Agent()
    d = agent.getDump()
    d.addCallback(success)
    return d
def create_widgets(self):
    for i in range(15):
        for j in range(15):
            f = tk.Frame(self, height=50, width=50)
            f.pack_propagate(0)
            f.grid(row=i, column=j, padx=0, pady=0)
            self.frames.append(f)
            b = tk.Label(f, image=self.image[0], bg="yellow")
            b.pack(fill=tk.BOTH, expand=1)
            b.bind("<Button-1>", self.click(i, j))
            self.button.append(b)


root = tk.Tk()
root.wm_title("Alpha Gomoku")
root.attributes("-topmost", True)

with tf.Session() as sess:
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", type=str)
    parser.add_argument("--chkpnt", "-c", type=int)
    parser.add_argument("--ensemble", "-e", action="store_true")
    args = parser.parse_args()
    if args.model_name == "minimax":
        agent = MinimaxAgent(max_depth=6, max_width=6)
    elif args.model_name == "mininet":
        agent = MCTSMinimaxAgent(sess, "supervised", chkpnt=args.chkpnt)
    else:
        agent = Agent(sess, args.model_name, chkpnt=args.chkpnt)
    app = Application(agent, root, ensemble=args.ensemble)
    app.mainloop()
from agent import Agent
from utils import plot_learning_curve, make_env
import numpy as np  # np.inf is used below but numpy was not imported in the original snippet
import torch as T
from gym import wrappers

env = make_env('PongNoFrameskip-v4')
best_score = -np.inf
load_checkpoint = True
n_games = 1
agent = Agent(gamma=0.99, epsilon=0.1, lr=0.0001,
              input_dims=(env.observation_space.shape),
              n_actions=(env.action_space.n), mem_size=1, eps_min=0.1,
              batch_size=32, replace=1000, eps_dec=1e-5,
              checkpoint_dir='models/', algo='DuelingDQNAgent',
              env_name='PongNoFrameskip-v4')
agent.load_models()
print(agent.q_eval)

#env = wrappers.Monitor(env, "tmp/dqn-video", video_callable=lambda episode_id: True, force=True)
n_steps = 0
score = 0
done = False
from agent import Agent
from funcs import playMatches

run_version = 1
player1version = 10
player2version = 50
EPISODES = 7
logger = loggers.logger_tourney
turns_until_tau0 = 0

env = Game()
network = ResCNN(config.REG_CONST, config.LEARNING_RATE, env.input_shape,
                 env.action_size, config.HIDDEN_CNN_LAYERS)

network.load(env.name, run_version, player1version)
player1 = Agent('player1', env.state_size, env.action_size,
                config.MCTS_SIMS, config.CPUCT, network)

network.load(env.name, run_version, player2version)
player2 = Agent('player2', env.state_size, env.action_size,
                config.MCTS_SIMS, config.CPUCT, network)

print('Players are ready, Tourney begins!')
goes_first = 0
scores, memory, points, sp_scores = playMatches(player1, player2, EPISODES, logger,
                                                turns_until_tau0, None, goes_first)
print(scores)
print(points)
print(sp_scores)
def __init__(self):
    self._agent = Agent()
)

env_eval_callback = instantiate_eval_callback(env_name=args.env_name)

if not args.stub_agent:
    agent = Agent(
        algo_name=args.algo_name,
        env_name=args.env_name,
        log_to_tensorboard=args.log_to_tensorboard,
        tb_log_name=args.tb_log_name,
        train_total_timesteps=args.train_total_timesteps,
        n_eval_episodes=args.n_eval_episodes,
        render=args.render,
        num_envs=args.num_envs,
        model_to_load=args.model_to_load,
        continue_learning=args.continue_learning,
        discrete_action_space=args.discrete_action_space,
        eval_callback=args.eval_callback,
        env_variables=env_variables,
        continue_learning_suffix=args.continue_learning_suffix,
        env_eval_callback=env_eval_callback,
        show_progress_bar=args.show_progress_bar,
        log_every=args.log_every,
        sb_version=args.sb_version,
        save_model=args.save_model,
        save_replay_buffer=args.save_replay_buffer,
        model_suffix=args.model_suffix,
    )
else:
    agent = AgentStub(
        algo_name=args.algo_name,
        env_name=args.env_name,
          '...')
    m_tmp = best_NN.read(env.name, initialise.INITIAL_RUN_NUMBER, best_player_version)
    #current_NN.model.set_weights(m_tmp.get_weights())
    best_NN.model.set_weights(m_tmp.get_weights())
#otherwise just ensure the weights on the two players are the same
else:
    best_player_version = 0
    best_NN.model.set_weights(current_NN.model.get_weights())

#copy the config file to the run folder
copyfile('./config.py', run_folder + 'config.py')
#plot_model(current_NN.model, to_file=run_folder + 'models/model.png', show_shapes = True)

print('\n')

######## CREATE THE PLAYERS ########
#current_player = Agent('current_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, current_NN)
best_player = Agent('best_player', env.state_size, env.action_size,
                    config.MCTS_SIMS, config.CPUCT, best_NN)
human_player = WindowUser('lzqt', env.state_size, env.action_size)

game = PlayWithAI(human_player, best_player, lg.logger_main,
                  turns_until_tau0=config.TURNS_UNTIL_TAU0, memory=memory)
game.initial()
game.loop()
class Tablero:

    def __init__(self):
        self.dimensiones = (0, 0)
        self.casillas = []
        self.agent = Agent()
        self.accion = Accion()
        self.pos_mario = None
        self.pos_tuberias = []
        self.restricciones_posiciones = [
            lambda posicion: posicion[0] <= self.dimensiones[0] and posicion[0] >= 0,  # valid x coordinate
            lambda posicion: posicion[1] <= self.dimensiones[1] and posicion[1] >= 0,  # valid y coordinate
        ]
        self.restricciones_casillas = [
            lambda casilla: not casilla.es_muro,     # not a wall cell
            lambda casilla: not casilla.visitado,    # not a visited cell
            lambda casilla: not casilla.es_tuberia   # not a pipe cell
        ]

    def ponerMurosTuberiasYMario(self, pos_muros, pos_tuberias, pos_mario):
        pos_x = 1
        for fila in self.casillas:
            pos_y = 1
            for casilla in fila:
                if (pos_x, pos_y) in pos_muros:
                    casilla.es_muro = True
                if (pos_x, pos_y) in pos_tuberias:
                    casilla.es_tuberia = True
                if (pos_x, pos_y) == pos_mario:
                    casilla.es_mario = True
                pos_y += 1
            pos_x += 1

    def crearTableroPorParametros(self, dimension_x, dimension_y, pos_muros, pos_tuberias, pos_mario):
        # define useful class attributes, decreasing each value by one to adapt to 0-based indices
        # (materialized as a list, since map() returns an iterator in Python 3)
        self.pos_tuberias = list(map(lambda posicion: (posicion[0] - 1, posicion[1] - 1), pos_tuberias))
        self.pos_mario = (pos_mario[0] - 1, pos_mario[1] - 1)
        self.dimensiones = (dimension_x - 1, dimension_y - 1)
        # The cells are built from the given dimensions,
        # as a list of lists of Casilla objects
        self.casillas = [[Casilla() for y in range(dimension_y)] for x in range(dimension_x)]
        self.ponerMurosTuberiasYMario(pos_muros, pos_tuberias, pos_mario)

    def definirPosicionesDeMurosYMario(self):
        for indice_x, fila in enumerate(self.casillas):
            for indice_y, elem in enumerate(fila):
                if elem.es_tuberia:
                    self.pos_tuberias.append((indice_x, indice_y))
                elif elem.es_mario:
                    self.pos_mario = (indice_x, indice_y)

    def crearTableroPorMapa(self, mapa):
        # Get the dimensions
        dimension_x = len(mapa) - 1
        dimension_y = len(mapa[0]) - 1
        self.dimensiones = (dimension_x, dimension_y)
        # Walk the symbol matrix to build the matrix of Casilla objects
        for fila in mapa:
            nueva_fila = []
            for elem in fila:
                # while building the Casilla matrix, mark each cell's type (wall, pipe, mario)
                casilla_aux = Casilla()
                casilla_aux.asignarTipo(elem)
                # Add the cell to the row
                nueva_fila.append(casilla_aux)
            # Add the row to the matrix
            self.casillas.append(nueva_fila)
        self.definirPosicionesDeMurosYMario()

    def mostrarCasillas(self):
        dimension_x = self.dimensiones[0]
        dimension_y = self.dimensiones[1]
        # print the header with the board's column indices
        print(' ', end='')
        for i in range(dimension_y + 1):
            print(f'__{i + 1}_', end='')
        print()
        pos_x = 1
        # then print the row index and the values of each row
        for fila in self.casillas:
            print(f'{pos_x} |', end='')
            for casilla in fila:
                print(f' {casilla.representacion()} |', end='')
            pos_x += 1
            print()

    # BFS is used from here on
    def habilitarSucesores(self, sucesores):
        habilitados = []
        for sucesor in sucesores:
            if all([restriccion(sucesor) for restriccion in self.restricciones_posiciones]):
                casilla = self.casillas[sucesor[0]][sucesor[1]]
                if all([restriccion(casilla) for restriccion in self.restricciones_casillas]):
                    casilla.visitado = True
                    habilitados.append(sucesor)
        return habilitados

    def designarPadreASucesores(self, sucesores, padre):
        casilla_padre = self.casillas[padre[0]][padre[1]]
        for sucesor in sucesores:
            casilla_hijo = self.casillas[sucesor[0]][sucesor[1]]
            if casilla_hijo.valor == 0 or casilla_hijo.valor > casilla_padre.valor + 1:
                casilla_hijo.valor = casilla_padre.valor + 1
                casilla_hijo.designarPadre(padre)

    def limpiarVisitados(self):
        for fila in self.casillas:
            for casilla in fila:
                casilla.visitado = False

    def expandirSucesores(self, estado):
        acciones = [self.accion.arriba, self.accion.abajo, self.accion.derecha, self.accion.izquierda]
        sucesores = self.agent.funcion_transicion(estado, acciones)
        sucesores = self.habilitarSucesores(sucesores)
        self.designarPadreASucesores(sucesores, estado)
        return sucesores

    def busqueda_de_estados_BFS(self, estados_iniciales):
        # set of queues for several simultaneous BFS runs
        colas = []
        colas.extend([[estado_inicial] for estado_inicial in estados_iniciales])
        # cerrado = []
        num_expansion = 1
        while sum([len(cola) for cola in colas]) != 0:
            self.mostrarCasillas()
            print(f"Expansion numero {num_expansion}:")
            num_expansion += 1
            for cola in colas:
                if len(cola) != 0:
                    estado = cola.pop(0)
                    cola.extend(self.expandirSucesores(estado))
        self.limpiarVisitados()

    def resolver(self):
        self.busqueda_de_estados_BFS(self.pos_tuberias)

    def caminoParaMario(self):
        # First add mario's position
        camino_nodos = []
        camino_nodos.append(self.pos_mario)
        # Define the cell we iterate from
        # print(self.pos_mario)
        casilla_actual = self.casillas[self.pos_mario[0]][self.pos_mario[1]]
        num_saltos_requeridos = casilla_actual.valor
        # Walk back along the path
        while casilla_actual.padre != None:
            camino_nodos.append(casilla_actual.padre)
            casilla_actual = self.casillas[casilla_actual.padre[0]][casilla_actual.padre[1]]
        # return the solution
        return (num_saltos_requeridos, camino_nodos)

    def mostrarCaminoMario(self):
        num_saltos, camino_nodos = self.caminoParaMario()
        print(f"Mario necesita {num_saltos} pasos para llegar a la tuberia mas cercana")
        print(f"A traves de las siguientes posiciones de casillas "
              f"{' -> '.join([repr((posicion[0] + 1, posicion[1] + 1)) for posicion in camino_nodos])}")
from environment import Environment
from agent import Agent

env = Environment()
agent = Agent()

FRAMES_TO_RUN = 10

for i in range(FRAMES_TO_RUN):
    env.update_state()
    env.print_state()
class PolicyLearner: def __init__(self, stock_code, chart_data, training_data=None, min_trading_unit=1, max_trading_unit=2, delayed_reward_threshold=.05, lr=0.01, tax=False): self.stock_code = stock_code # Stock coder self.chart_data = chart_data self.environment = Environment(chart_data) # Environment object self.tax = tax # Agent object self.agent = Agent(self.environment, min_trading_unit=min_trading_unit, max_trading_unit=max_trading_unit, delayed_reward_threshold=delayed_reward_threshold, tax=tax) self.training_data = training_data # Training data self.sample = None self.training_data_idx = -1 # Policy neural network; Input size = size of training data + agent state size self.num_features = self.training_data.shape[1] + self.agent.STATE_DIM self.AC = ACagent(input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS, lr=lr) self.visualizer = Visualizer() # Visualization module def reset(self): self.sample = None self.training_data_idx = -1 def fit(self, num_epoches=1000, max_memory=60, balance=10000000, discount_factor=0, start_epsilon=.5, learning=True, monkey=False): logging.info( "\n\nAcotr LR: {Alr}, Critic LR: {Clr}, DF: {discount_factor}, " "TU: [{min_trading_unit}, {max_trading_unit}], " "DRT: {delayed_reward_threshold}, Tax: {tax}".format( Alr=self.AC.actor_lr, Clr=self.AC.critic_lr, discount_factor=discount_factor, min_trading_unit=self.agent.min_trading_unit, max_trading_unit=self.agent.max_trading_unit, delayed_reward_threshold=self.agent.delayed_reward_threshold, tax=self.tax)) # Visualization Preparation # Pre-visualization the chart data as it does not change self.visualizer.prepare(self.environment.chart_data) # Prepare the folders to store visualization results epoch_summary_dir = os.path.join( settings.BASE_DIR, 'epoch_summary/%s/epoch_summary_%s' % (self.stock_code, settings.timestr)) if not os.path.isdir(epoch_summary_dir): os.makedirs(epoch_summary_dir) # Set agent's initial balance self.agent.set_balance(balance) # Initialize the information about training max_portfolio_value = 0 epoch_win_cnt = 0 # Training repetition for epoch in range(num_epoches): # Initialize the information about epoch #loss = 0. itr_cnt = 0 win_cnt = 0 exploration_cnt = 0 batch_size = 0 # Initialize the memory memory_sample = [] memory_action = [] memory_reward = [] memory_prob = [] memory_pv = [] memory_num_stocks = [] memory_exp_idx = [] memory_learning_idx = [] # Initialize the environment, agent and policy nerual network self.environment.reset() self.agent.reset() self.AC.reset() self.reset() # Initialize the visualizer self.visualizer.clear([0, len(self.chart_data)]) # Exploration rate decreases as you progress if monkey: epsilon = 1 else: if learning: epsilon = start_epsilon * (1. 
- float(epoch) / (num_epoches - 1)) else: epsilon = 0 while True: # Sample generation next_sample = self._build_sample() if next_sample is None: break # Actions decided by policy neural network or exploration action, confidence, exploration = self.agent.decide_action( self.AC, self.sample, epsilon) # Perform the action you decided and earn immediate and delayed rewards immediate_reward, delayed_reward = self.agent.act( action, confidence) # Store the actions and the consequences for the actions memory_sample.append(next_sample) memory_action.append(action) memory_reward.append(immediate_reward) memory_pv.append(self.agent.portfolio_value) memory_num_stocks.append(self.agent.num_stocks) memory = [ (memory_sample[i], memory_action[i], memory_reward[i]) for i in list(range(len(memory_action)))[-max_memory:] ] if exploration: memory_exp_idx.append(itr_cnt) memory_prob.append([np.nan] * Agent.NUM_ACTIONS) else: memory_prob.append(self.AC.prob) # Update the information about iterations batch_size += 1 itr_cnt += 1 exploration_cnt += 1 if exploration else 0 win_cnt += 1 if delayed_reward > 0 else 0 # Update policy neural network when in training mode and delay rewards exist if delayed_reward == 0 and batch_size >= max_memory: delayed_reward = immediate_reward self.agent.base_portfolio_value = self.agent.portfolio_value if learning and delayed_reward != 0: # Size of batch traning data batch_size = min(batch_size, max_memory) # Generate batch training data x, _ = self._get_batch(memory, batch_size, discount_factor, delayed_reward) if len(x) > 0: # Update Policy neural network self.AC.train_model(self.sample, action, delayed_reward, next_sample) memory_learning_idx.append([itr_cnt, delayed_reward]) batch_size = 0 # Visualize the information about epoches num_epoches_digit = len(str(num_epoches)) epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0') self.visualizer.plot(epoch_str=epoch_str, num_epoches=num_epoches, epsilon=epsilon, action_list=Agent.ACTIONS, actions=memory_action, num_stocks=memory_num_stocks, outvals=memory_prob, exps=memory_exp_idx, learning=memory_learning_idx, initial_balance=self.agent.initial_balance, pvs=memory_pv) self.visualizer.save( os.path.join( epoch_summary_dir, 'epoch_summary_%s_%s.png' % (settings.timestr, epoch_str))) logging.info("[Epoch {}/{}]\tEpsilon:{}\t#Expl.:{}/{}\t" "#Buy:{}\t#Sell:{}\t#Hold:{}\t" "#Stocks:{}\tPV:{:,}원\t".format( epoch_str, num_epoches, round(epsilon, 4), exploration_cnt, itr_cnt, self.agent.num_buy, self.agent.num_sell, self.agent.num_hold, self.agent.num_stocks, int(self.agent.portfolio_value))) # Update the information about training max_portfolio_value = max(max_portfolio_value, self.agent.portfolio_value) if self.agent.portfolio_value > self.agent.initial_balance: epoch_win_cnt += 1 # Record the information about training in log logging.info("Max PV: {:,}원, \t # Win: {}".format( int(max_portfolio_value), epoch_win_cnt)) def _get_batch(self, memory, batch_size, discount_factor, delayed_reward): x = np.zeros((batch_size, 1, self.num_features)) y = np.full((batch_size, self.agent.NUM_ACTIONS), 0.5) for i, (sample, action, _) in enumerate(reversed(memory[-batch_size:])): x[i] = np.array(sample).reshape((-1, 1, self.num_features)) y[i, action] = (delayed_reward + 1) / 2 if discount_factor > 0: y[i, action] *= discount_factor**i return x, y def _build_sample(self): self.environment.observe() if len(self.training_data) > self.training_data_idx + 1: self.training_data_idx += 1 self.sample = self.training_data.iloc[ 
self.training_data_idx].tolist() self.sample.extend(self.agent.get_states()) return self.sample return None def trade(self, model_path=None, balance=2000000): if model_path is None: return self.AC.load_model(model_path=model_path) self.fit(balance=balance, num_epoches=1, learning=False)
def main(): #usage arg1 : -s to save, arg2 source comp file, arg3 last test file_read = "./stats.txt" if (len(sys.argv) > 2): file_read = sys.argv[2] end = 156 if (len(sys.argv) > 3): end = int(sys.argv[3]) + 1 last_attempt = [None] f = open(file_read, "r").read() for i in f.split("\n")[:-1]: try: info = re.sub( r"\(.*?\)", "", i.split("duration")[1].replace("[ms]", "").replace("steps", "")).split(":") last_attempt.append((int(info[0]), int(info[1]))) except: last_attempt.append(((99999), (9999))) try: last_attempt_info = json.loads(f.split("\n")[-1]) except: last_attempt_info = { "beaten": 0, "avg time[ms]": 99999, "max time[ms]": 99999, "min time[ms]": 99999, "avg steps": 99999, "max steps": 99999, "min steps": 99999 } for i in range(len(last_attempt), 156): last_attempt.append(((99999), (9999))) current_attempt = [None] file_save = None if (len(sys.argv) > 1 and sys.argv[1] == "-s"): file_save = open("./stats_new.txt", "w") for i in range(1, end): start_time = time() #_original_stdout = sys.stdout #sys.stdout = open(os.devnull, 'w') steps = asyncio.run( Agent(open(f"levels/{i}.xsb").read()).solve(300, float("inf"))) #sys.stdout.close() #sys.stdout = _original_stdout #print(">>", steps, "<<") elapsed_time = time() - start_time if (steps != None): current_attempt.append((int(elapsed_time * 1000), len(steps))) time_str = "{:>9}".format((str(int(elapsed_time * 1000))) + " ") + \ "({:>9})".format( (str((int(elapsed_time * 1000)) - last_attempt[i][0]))) level_str = "{:<2}".format(i) steps_str = "{:>6}".format(len(steps)) + \ "({:>6})".format((len(steps) - last_attempt[i][1])) if (file_save): file_save.write( f"level {level_str} - duration[ms] {time_str}: steps {steps_str}\n" ) file_save.flush() print( f"level {level_str} - duration[ms] {time_str}: steps {steps_str}" ) else: current_attempt.append(None) if (file_save): file_save.write(f"level {i} - Timed out\n") file_save.flush() print(f"level {i} - Timed out") times = [i[0] for i in current_attempt if i is not None] stepss = [i[1] for i in current_attempt if i is not None] beaten = [i for i in current_attempt if i is not None] info = { "beaten": len(beaten), "avg time[ms]": int(sum(times) / len(times)), "max time[ms]": max(times), "min time[ms]": min(times), "avg steps": int(sum(stepss) / len(stepss)), "max steps": max(stepss), "min steps": min(stepss), "diff beaten": len(beaten) - last_attempt_info["beaten"], "diff avg time[ms]": int(sum(times) / len(times)) - last_attempt_info["avg time[ms]"], "diff max time[ms]": max(times) - last_attempt_info["max time[ms]"], "diff min time[ms]": min(times) - last_attempt_info["min time[ms]"], "diff avg steps": int(sum(stepss) / len(stepss)) - last_attempt_info["avg steps"], "diff max steps": max(stepss) - last_attempt_info["max steps"], "diff min steps": min(stepss) - last_attempt_info["min steps"], } if (file_save): file_save.write(json.dumps(info)) file_save.flush() print(json.dumps(info))
import pickle
import time
from os import listdir, remove
from os.path import isfile, getsize

from agent import Agent  # assumed import path; Agent is used below but not imported in the original snippet

start = time.time()


def updateMemory(agent):
    location = 'trainlocally/multi'
    fileNames = [f"{location}/Data/{f}" for f in listdir(f"{location}/Data/")
                 if isfile(f"{location}/Data/{f}")]
    if len(fileNames) == 0:
        return
    fileNames.sort(key=lambda x: getsize(x), reverse=True)
    fileName = fileNames[0]
    if getsize(fileName) == 0:
        return
    try:
        with open(fileName, 'rb') as file:
            memory = pickle.load(file)
        remove(fileName)
        for mem in memory:
            agent.remember(*mem)
    except Exception as e:
        print(e)
        return


agent = Agent(memory=30000)
i = 0
while time.time() - start < 300:
    updateMemory(agent)
    print(i)
    time.sleep(1)
    i += 1
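The loop above only consumes pickled experience files from trainlocally/multi/Data/; the workers that produce them are not part of this snippet. A minimal producer sketch, assuming each experience is a tuple matching the arguments of agent.remember (dumpMemory and the file naming are illustrative only):

import os
import pickle
import uuid


def dumpMemory(experiences, location='trainlocally/multi'):
    # Write a list of experience tuples to a uniquely named pickle file
    # so the consumer loop above can pick it up with pickle.load().
    os.makedirs(f"{location}/Data/", exist_ok=True)
    fileName = f"{location}/Data/{uuid.uuid4().hex}.pkl"
    with open(fileName, 'wb') as file:
        pickle.dump(experiences, file)


# example: dumpMemory([(state, action, reward, next_state, done)])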
def main():
    epsilon, discount, alpha, iterations, selfPlay, readValues = get_args(sys.argv)

    if selfPlay == True:
        agent1 = Agent(1, epsilon, discount, alpha)
        agent2 = Agent(-1, epsilon, discount, alpha)
        print("Beginning self play. Corresponding state values will be stored in agent1_values.txt and agent2_values.txt")
        for i in range(iterations):
            print("Iteration %d..." % (i))
            self_play(agent1, agent2)
        agent1.write_qvalues('agent1_values.txt')
        agent2.write_qvalues('agent2_values.txt')
    elif readValues == True:
        token = 0
        ai = 0
        while (True):
            token = input("What piece would you like to be (X/O)")
            if token == "X" or token == "O":
                break
        if token == "X":
            token = 1
            ai = Agent(token * -1, epsilon=.2, discount=.7, alpha=.7,
                       readValues=True, file="./agent2_values.txt")
        else:
            token = -1
            ai = Agent(token * -1, epsilon=.2, discount=.7, alpha=.7,
                       readValues=True, file="./agent1_values.txt")
        human = Human(token)
        if token == 1:
            # human is X
            play_human_vs_ai(human, ai, token)
            ai.write_qvalues("agent2_values.txt")
        else:
            play_human_vs_ai(ai, human, token)
            ai.write_qvalues("agent1_values.txt")
def __init__(self, boxHeight, boxWidth, boxMargin, rowCount, columnCount, master=None, alpha=0.1, gamma=0.9, epsilon=0.1, episode_count=500, game_sleep=0.1, training_sleep=0.01): super().__init__(master) self.colorBlack = '#000000' self.windowTitle = 'Run Forrest Run!! The RL Game' self.colorBlack = '#000000' self.colorWhite = '#FAFAFA' self.colorGray = '#B9B9B9' self.colorRunner = '#0A9F23' self.colorChaser = '#9F0A0A' self.colorRunnerPlaceConfiguration = '#B5FFB9' self.colorChaserPlaceConfiguration = '#FFB5B5' self.colorRockPlaceConfiguration = '#B5ECFF' self.colorRunnerBehaviourConfiguration = '#F8B5FF' self.colorRockCountConfiguration = '#FFFFB5' self.colorTurnCountConfiguration = '#B5CFFF' self.buttonGrassText = 'grass' self.buttonRockText = 'rock' self.buttonRunnerText = 'runner' self.buttonChaser1Text = 'chaser1' self.buttonChaser2Text = 'chaser2' self.boxFont = ("Calibri", 22) self.buttonFont = ("Calibri", 12) self.windowHeight = 302 + boxHeight * (rowCount + 1) + boxMargin * (rowCount + 2) self.windowWidth = boxWidth * (columnCount + 1) + boxMargin * (columnCount + 2) self.boxWidth = boxWidth self.boxHeight = boxHeight self.boxMargin = boxMargin self.rowCount = rowCount self.columnCount = columnCount self.alpha = alpha self.gamma = gamma self.epsilon = epsilon self.game_sleep = game_sleep self.training_sleep = training_sleep self.boardTopMargin = self.windowHeight - ( self.rowCount + 1) * self.boxHeight - (self.rowCount + 2) * self.boxMargin self.generalHeight = (self.boardTopMargin - 8 * self.boxMargin) / 8 self.configurationFrameHeaderWidth = (self.windowWidth - 3 * self.boxMargin) * 0.5 self.configurationFrameHeaderX = self.boxMargin self.configurationFrameHeaderY = self.boxMargin self.consoleHeaderX = self.boxMargin * 2 + self.configurationFrameHeaderWidth self.consoleHeaderY = self.boxMargin self.configurationFrameEltWidth = (self.configurationFrameHeaderWidth - 2 * self.boxMargin) / 3 self.window = master self.runnerController = tk.IntVar() self.runnerController.set(BehaviorConfig.Auto.value) self.runnerPlaceController = tk.IntVar() self.runnerPlaceController.set(PlaceConfig.Random.value) self.chaserPlaceController = tk.IntVar() self.chaserPlaceController.set(PlaceConfig.Random.value) self.obstaclePlaceController = tk.IntVar() self.obstaclePlaceController.set(PlaceConfig.Random.value) self.obstacleCount = tk.IntVar() self.obstacleCount.set(20) self.obstacleCounter = 0 self.turnCount = tk.IntVar() self.turnCount.set(100) self.turnCounter = 0 self.window.title(self.windowTitle) self.window.resizable(0, 0) self.window.geometry( str(self.windowWidth) + 'x' + str(self.windowHeight)) self.window.wm_iconphoto( False, ImageTk.PhotoImage(Image.open('image/runner.png'))) self.window.configure(bg=self.colorGray) self.imageGrass = tk.PhotoImage(file='image/grass.png') self.imageObstacle = tk.PhotoImage(file='image/obstacle.png') self.imageRunner = tk.PhotoImage(file='image/runner.png') self.imageChaser1 = tk.PhotoImage(file='image/chaser1.png') self.imageChaser2 = tk.PhotoImage(file='image/chaser2.png') self.buttonImage = { ButtonType.Grass: self.imageGrass, ButtonType.Obstacle: self.imageObstacle, ButtonType.Runner: self.imageRunner, ButtonType.Chaser1: self.imageChaser1, ButtonType.Chaser2: self.imageChaser2 } self.state = [] self.original_agent_states = {} self.waitingJob = WaitingJob.ApplyConfiguration self.isRunnerCaught = False self.default_q_table = pd.DataFrame( 0, index=pd.MultiIndex.from_product([ list(range(1, rowCount + 1)), list(range(1, columnCount + 1)) ]), columns=['NO 
MOVE', 'NORTH', 'EAST', 'SOUTH', 'WEST']) #self.runner = Agent(0, 0, ButtonType.Runner, self.default_q_table.copy()) #self.chaser1 = Agent(0, 0, ButtonType.Chaser1, self.default_q_table.copy()) #self.chaser2 = Agent(0, 0, ButtonType.Chaser2, self.default_q_table.copy()) self.runner = Agent( 0, 0, ButtonType.Runner, pd.read_pickle('pickle/runner_' + str(self.rowCount) + '_' + str(self.columnCount) + '.pkl')) self.chaser1 = Agent( 0, 0, ButtonType.Chaser1, pd.read_pickle('pickle/chaser1_' + str(self.rowCount) + '_' + str(self.columnCount) + '.pkl')) self.chaser2 = Agent( 0, 0, ButtonType.Chaser2, pd.read_pickle('pickle/chaser2_' + str(self.rowCount) + '_' + str(self.columnCount) + '.pkl')) self.runnerDefaultPlace = (1, 1) self.chaser1DefaultPlace = (self.rowCount, self.columnCount) self.chaser2DefaultPlace = (self.rowCount, self.columnCount - 1) self.applyConfigButtonList = [] self.isTraining = False self.episode_count = episode_count self.episode_counter = 0 counter = 0 while counter <= self.columnCount: self.window.rowconfigure(counter, weight=1) self.window.columnconfigure(counter, weight=1) counter += 1 self.configurationsHeaderLabel = self.makeLabel( self.window, self.configurationFrameHeaderX, self.configurationFrameHeaderY, self.generalHeight, self.configurationFrameHeaderWidth, text='Configurations', bg=self.colorWhite) self.runnerPlaceConfigurationLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 2 + self.generalHeight * 1, self.generalHeight, self.configurationFrameEltWidth, text='Runner Place', bg=self.colorRunnerPlaceConfiguration) self.runnerPlaceDefaultRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 3 + self.generalHeight * 2, self.generalHeight, self.configurationFrameEltWidth, text='Default', bg=self.colorRunnerPlaceConfiguration, value=PlaceConfig.Default.value, variable=self.runnerPlaceController, justify=tk.LEFT) self.runnerPlaceRandomRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 4 + self.generalHeight * 3, self.generalHeight, self.configurationFrameEltWidth, text='Random', bg=self.colorRunnerPlaceConfiguration, value=PlaceConfig.Random.value, variable=self.runnerPlaceController) self.runnerPlaceManualRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 5 + self.generalHeight * 4, self.generalHeight, self.configurationFrameEltWidth, text='Manual', bg=self.colorRunnerPlaceConfiguration, value=PlaceConfig.Manual.value, variable=self.runnerPlaceController) self.chaserPlaceConfigurationLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 2 + self.generalHeight * 1, self.generalHeight, self.configurationFrameEltWidth, text='Chaser Place', bg=self.colorChaserPlaceConfiguration) self.chaserPlaceDefaultRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 3 + self.generalHeight * 2, self.generalHeight, self.configurationFrameEltWidth, text='Default', bg=self.colorChaserPlaceConfiguration, value=PlaceConfig.Default.value, variable=self.chaserPlaceController, justify=tk.LEFT) self.chaserPlaceRandomRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 4 + self.generalHeight * 3, self.generalHeight, self.configurationFrameEltWidth, text='Random', 
bg=self.colorChaserPlaceConfiguration, value=PlaceConfig.Random.value, variable=self.chaserPlaceController) self.chaserPlaceManualRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 5 + self.generalHeight * 4, self.generalHeight, self.configurationFrameEltWidth, text='Manual', bg=self.colorChaserPlaceConfiguration, value=PlaceConfig.Manual.value, variable=self.chaserPlaceController) self.rockPlaceConfigurationLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 2 + self.generalHeight * 1, self.generalHeight, self.configurationFrameEltWidth, text='Rock Place', bg=self.colorRockPlaceConfiguration) self.rockPlaceDefaultRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 3 + self.generalHeight * 2, self.generalHeight, self.configurationFrameEltWidth, text='Default', bg=self.colorRockPlaceConfiguration, value=PlaceConfig.Default.value, variable=self.obstaclePlaceController, justify=tk.LEFT) self.rockPlaceRandomRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 4 + self.generalHeight * 3, self.generalHeight, self.configurationFrameEltWidth, text='Random', bg=self.colorRockPlaceConfiguration, value=PlaceConfig.Random.value, variable=self.obstaclePlaceController) self.rockPlaceManualRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 5 + self.generalHeight * 4, self.generalHeight, self.configurationFrameEltWidth, text='Manual', bg=self.colorRockPlaceConfiguration, value=PlaceConfig.Manual.value, variable=self.obstaclePlaceController) self.runnerBehaviorConfigurationLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Runner Behavior', bg=self.colorRunnerBehaviourConfiguration) self.runnerBehaviorAutoRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Auto', bg=self.colorRunnerBehaviourConfiguration, value=BehaviorConfig.Auto.value, variable=self.runnerController) self.runnerBehaviorManualRB = self.makeRadioButton( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Manual', bg=self.colorRunnerBehaviourConfiguration, value=BehaviorConfig.Manual.value, variable=self.runnerController) self.turnCountLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight, self.configurationFrameEltWidth, text='Turn Count', bg=self.colorTurnCountConfiguration) self.turnCountSpinbox = self.makeSpinbox( self.window, self.configurationFrameEltWidth * 0 + self.boxMargin * 1, self.boxMargin * 8 + self.generalHeight * 7, self.generalHeight, self.configurationFrameEltWidth, textvariable=self.turnCount, from_=10, to=1000) self.obstacleCountLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight, self.configurationFrameEltWidth, text='Obstacle Count', bg=self.colorRockCountConfiguration) 
self.obstacleCountSpinbox = self.makeSpinbox( self.window, self.configurationFrameEltWidth * 1 + self.boxMargin * 2, self.boxMargin * 8 + self.generalHeight * 7, self.generalHeight, self.configurationFrameEltWidth, textvariable=self.obstacleCount, from_=1, to=50) self.applyConfigurationButton = self.makeButton( self.window, self.configurationFrameEltWidth * 2 + self.boxMargin * 3, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight * 2 + self.boxMargin, self.configurationFrameEltWidth, ButtonType.ApplyConfig, text='Apply\nConfiguration', bg='blue', fg=self.colorWhite, font=self.buttonFont) self.console = self.makeTextbox( self.window, self.consoleHeaderX, self.consoleHeaderY, self.windowHeight - (self.rowCount + 6) * self.boxMargin - (self.rowCount + 1) * self.boxHeight - self.generalHeight * 3, self.configurationFrameHeaderWidth, bg='light yellow') self.runnerScoreLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 3 + self.boxMargin * 4, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Runner Score', bg=self.colorWhite) self.runnerScoreBoard = self.makeLabel( self.window, self.configurationFrameEltWidth * 3 + self.boxMargin * 4, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight, self.configurationFrameEltWidth, text='0', bg=self.colorWhite) self.chaser1ScoreLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 4 + self.boxMargin * 5, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Chaser 1 Score', bg=self.colorWhite) self.chaser1ScoreBoard = self.makeLabel( self.window, self.configurationFrameEltWidth * 4 + self.boxMargin * 5, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight, self.configurationFrameEltWidth, text='0', bg=self.colorWhite) self.chaser2ScoreLabel = self.makeLabel( self.window, self.configurationFrameEltWidth * 5 + self.boxMargin * 6, self.boxMargin * 6 + self.generalHeight * 5, self.generalHeight, self.configurationFrameEltWidth, text='Chaser 2 Score', bg=self.colorWhite) self.chaser2ScoreBoard = self.makeLabel( self.window, self.configurationFrameEltWidth * 5 + self.boxMargin * 6, self.boxMargin * 7 + self.generalHeight * 6, self.generalHeight, self.configurationFrameEltWidth, text='0', bg=self.colorWhite) self.startGameButton = self.makeButton( self.window, self.configurationFrameEltWidth * 3 + self.boxMargin * 4, self.boxMargin * 8 + self.generalHeight * 7, self.generalHeight, self.configurationFrameEltWidth * 1.5 + self.boxMargin * 1, ButtonType.StartGame, text='Start Game', bg='blue', fg=self.colorWhite, font=self.buttonFont, state='disabled') self.trainAgentsButton = self.makeButton( self.window, self.configurationFrameEltWidth * 4.5 + self.boxMargin * 5, self.boxMargin * 8 + self.generalHeight * 7, self.generalHeight, self.configurationFrameEltWidth * 1.5 + self.boxMargin * 1, ButtonType.StartGame, text='Train Agents', bg='yellow', fg=self.colorWhite, font=self.buttonFont, state='disabled') self.applyConfigurationButton.bind('<1>', self.handleEvent) self.startGameButton.bind('<1>', self.handleEvent) self.trainAgentsButton.bind('<1>', self.trainAgents) self.applyConfigButtonList.append(self.runnerPlaceDefaultRB) self.applyConfigButtonList.append(self.runnerPlaceRandomRB) self.applyConfigButtonList.append(self.runnerPlaceManualRB) self.applyConfigButtonList.append(self.chaserPlaceDefaultRB) self.applyConfigButtonList.append(self.chaserPlaceRandomRB) 
self.applyConfigButtonList.append(self.chaserPlaceManualRB) self.applyConfigButtonList.append(self.rockPlaceDefaultRB) self.applyConfigButtonList.append(self.rockPlaceRandomRB) self.applyConfigButtonList.append(self.rockPlaceManualRB) self.applyConfigButtonList.append(self.runnerBehaviorAutoRB) self.applyConfigButtonList.append(self.runnerBehaviorManualRB) self.applyConfigButtonList.append(self.turnCountSpinbox) self.applyConfigButtonList.append(self.obstacleCountSpinbox) self.applyConfigButtonList.append(self.applyConfigurationButton) self.elts = [] self.appendToConsole('Welcome to the RL Game - Run Forrest Run!!') self.appendToConsole('Waiting for configuration...') self.initializeElts() self.initializeState() self.window.mainloop()
from agent import Agent
from environment import Environment  # assumed import path; Environment() is used below but not imported in the original snippet
import matplotlib.pyplot as plt
import random
random.seed(1)
import numpy as np
from utils import compute_mse, Trace

if __name__ == '__main__':
    """ test sarsa lambda algorithm """
    """
    the learning curve of mean-squared error against episode number
    for lambda = 0 and lambda = 1
    """
    env = Environment()
    agent = Agent(env)

    print('the learning curve of mean-squared error against episode number for')
    print('lambda = 0')
    agent.td_learning(10000, 0.0, True, trace=Trace.accumulating)
    agent.reset()
    print('lambda = 1')
    agent.td_learning(10000, 1.0, True, trace=Trace.accumulating)
    agent.reset()

    print('The mean-squared error against lambda')
    monte_carlo_iterations = 1000000
    td_iterations = 10000
    agent.monte_carlo_control(monte_carlo_iterations)
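agent.td_learning is exercised above with lambda = 0 and lambda = 1 using accumulating traces; its body is not shown in this snippet, but the standard Sarsa(lambda) update with accumulating eligibility traces that such a call typically performs is:

$$\delta_t = r_{t+1} + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t), \qquad E(s_t, a_t) \leftarrow E(s_t, a_t) + 1,$$
$$Q(s, a) \leftarrow Q(s, a) + \alpha\, \delta_t\, E(s, a), \qquad E(s, a) \leftarrow \gamma \lambda\, E(s, a) \quad \text{for all } s, a.$$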
class Environment:

    def __init__(self, Double, Dueling, PER):
        self.env = gym.make(ENV)  # set up the task
        num_states = self.env.observation_space.shape[0]  # number of state variables of the task (4)
        num_actions = self.env.action_space.n  # number of possible actions of the task (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER
        self.agent = Agent(num_states, num_actions, Double, Dueling, PER)  # create the object that acts as the agent
        self.NumEpisode = []
        self.AvgSteps = []

    def run(self):
        '''Run'''
        episode_10_list = np.zeros(10)  # steps survived over the last 10 episodes (used to print the average)
        complete_episodes = 0  # number of episodes so far that lasted 195 steps
        episode_final = False  # whether this is the final episode
        frames = []  # frames of the final episode, stored to build an animation

        for episode in range(NUM_EPISODES):  # repeat up to the maximum number of episodes
            observation = self.env.reset()  # reset the environment

            state = observation  # use the observation directly as the state s
            state = torch.from_numpy(state).type(torch.FloatTensor)  # convert the NumPy array to a PyTorch tensor
            state = torch.unsqueeze(state, 0)  # convert size 4 to size 1*4

            for step in range(MAX_STEPS):  # loop over one episode
                #if episode_final is True:  # in the final episode, store each frame in frames
                #    frames.append(self.env.render(mode='rgb_array'))

                action = self.agent.get_action(state, episode)  # decide the next action

                # execute action a_t to get the next state s_{t+1} and the done flag;
                # call .item() on action to obtain its value
                observation_next, _, done, _ = self.env.step(action.item())  # reward and info are unused, so they go to _

                # assign the reward, decide whether the episode is over, and set state_next
                if done:  # done becomes True after 200 steps or when the pole tilts past a threshold
                    state_next = None  # there is no next state, so use None

                    # record the number of steps survived over the last 10 episodes
                    episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                    if step < 195:
                    #if step < 295:
                        reward = torch.FloatTensor([-1.0])  # penalty of -1 if the pole fell over mid-episode
                        complete_episodes = 0  # reset the streak of successful episodes
                    else:
                        reward = torch.FloatTensor([1.0])  # reward of 1 if the pole stayed up to the end of the episode
                        complete_episodes = complete_episodes + 1  # extend the streak of successful episodes

                    # store values for plotting
                    self.NumEpisode.append(episode)
                    self.AvgSteps.append(episode_10_list[-1])
                else:
                    reward = torch.FloatTensor([0.0])  # reward 0 otherwise
                    state_next = observation_next  # use the observation directly as the state
                    state_next = torch.from_numpy(state_next).type(torch.FloatTensor)  # convert NumPy to a PyTorch tensor
                    state_next = torch.unsqueeze(state_next, 0)  # convert size 4 to size 1*4

                # store the experience in memory
                self.agent.memorize(state, action, state_next, reward)

                # store the TD error in the TD-error memory
                # added for Prioritized Experience Replay
                if self.PER == True:
                    self.agent.memorize_td_error(0)  # store 0 here instead of the exact value

                # update the Q function with Experience Replay
                if self.PER == True:
                    self.agent.update_q_function(episode)
                else:
                    self.agent.update_q_function()

                # update the observation
                state = state_next

                # end-of-episode handling
                if done:
                    print('DQN with Double : %r, Dueling : %r, PER : %r' %
                          (self.Double, self.Dueling, self.PER))
                    print('%d Episode: Finished after %d steps : average steps over the last 10 episodes = %.1lf' %
                          (episode, step + 1, episode_10_list.mean()))

                    # PER - update the TD errors stored in the TD-error memory
                    if self.PER == True:
                        self.agent.update_td_error_memory()

                    # DDQN
                    if (episode % 2 == 0):
                        self.agent.update_target_q_function()
                    break

            if episode_final is True:
                # create and save the animation
                #display_frames_as_gif(frames)
                break

            # the task counts as solved after 10 consecutive episodes of 195 steps
            if complete_episodes >= 5:
                print('---- DQN with Double : %r, Dueling : %r, PER : %r ----' %
                      (self.Double, self.Dueling, self.PER))
                print('succeeded 10 episodes in a row')
                print('------------------------------------------------------')

                # plot the results
                filename = "DQN_Double_%r_Dueling_%r_PER_%r_" % (
                    self.Double, self.Dueling, self.PER) + datetime.datetime.now().strftime('%Y-%m-%d %H %M') + '.png'
                directory = './SaveResult'
                savepath = os.path.join(directory, filename)
                plt.figure('%d%d%d' % (self.Double, self.Dueling, self.PER))
                plt.scatter(self.NumEpisode, self.AvgSteps)
                plt.xlabel('num of episode')
                plt.ylabel('average steps')
                plt.title('DQN with Double : %r, Dueling : %r, PER : %r' %
                          (self.Double, self.Dueling, self.PER))
                plt.grid()
                plt.savefig(savepath)
                plt.show()

                episode_final = True  # generate the animation in the next episode
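The DDQN branch above delegates the actual learning step to agent.update_q_function and agent.update_target_q_function, whose bodies are not part of this snippet; for reference, the standard Double DQN target such an update regresses toward (with online parameters theta and target parameters theta-minus) is:

$$y = r + \gamma\, Q_{\theta^-}\!\big(s',\ \arg\max_{a'} Q_{\theta}(s', a')\big).$$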
class PolicyLearner: def __init__(self, load_model=True, learning_rate=0.005, min_trading_unit=0, max_trading_unit=10, delayed_reward_threshold=.01, training=True): self.environment = Environment() self.agent = Agent(self.environment, min_trading_unit=min_trading_unit, max_trading_unit=max_trading_unit, delayed_reward_threshold=delayed_reward_threshold) self.batch_size = 2 self.update_freq = 4 self.y = .99 self.discount_factor = .8 #0.8**30 = 0.004 self.startE = 1 self.endE = 0.1 self.anneling_steps = 10000. self.num_episodes = 10000 self.pre_train_steps = 200 self.max_epLength = 20 self.replay_memory = 10 self.training_step = 5 self.load_model = load_model self.path = './dqn' # 모델을 세이브할 장소를 만든다. if not os.path.exists(self.path): os.makedirs(self.path) # self.h_size = 512 self.tau = 0.001 tf.reset_default_graph() self.network_type = [20, 25] #, 6, 7] self.buffer_size = 0 for image_type in self.network_type: image_size = 1 for shape in self.environment.RANGE_SHAPE[image_type]: image_size *= shape self.buffer_size += image_size self.buffer_size = ((15 * (1024**3)) // (self.buffer_size * 2 * self.max_epLength)) // 10 * 10 #10GB / Imagesize print(self.buffer_size) self.mainQN = [ Qnetwork(learning_rate=learning_rate, model_type=type, name='main_' + str(type)) for type in self.network_type ] if training: self.targetQN = [ Qnetwork(learning_rate=learning_rate, model_type=type, name='target_' + str(type)) for type in self.network_type ] ''' self.mainQN = [Qnetwork(learning_rate=learning_rate, model_type=5), Qnetwork(learning_rate=learning_rate, model_type=20), Qnetwork(learning_rate=learning_rate, model_type=60)] self.targetQN = [Qnetwork(learning_rate=learning_rate, model_type=5), Qnetwork(learning_rate=learning_rate, model_type=20), Qnetwork(learning_rate=learning_rate, model_type=60)] ''' def train(self): init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=1, reshape=True) trainables = tf.trainable_variables() targetOps = updateTargetGraph(trainables, self.tau) rList = [] #portfolio_list=[] total_steps = 0 myBuffer = experience_buffer(self.buffer_size) episode_buffer = experience_buffer() e = self.startE stepDrop = (self.startE - self.endE) / self.anneling_steps with tf.Session() as sess: # 변수를 초기화한다. 
sess.run(init) if self.load_model == True: print('Loading Model...') # load the saved model ckpt = tf.train.get_checkpoint_state(self.path) saver.restore(sess, ckpt.model_checkpoint_path) e = self.endE # set the target network equal to the main network updateTarget(targetOps, sess) # start of the episode loop for ii in range(self.num_episodes): rAll = 0 d = False j = 0 episode_buffer.buffer = [] episode_reward_buffer = [] self.environment.reset() self.agent.reset() rnn_state = np.array( [mainQN.state_init for mainQN in self.mainQN]) #print('Initializing episode %d :' % ii, self.environment.idx, self.environment.KOSPI_idx, 'total num :', total_steps, 'stock code', self.environment.chart_code) s = [ self.environment.get_image(days) for days in self.network_type ] s_potfol = np.array(self.agent.get_states()) episode_step = 1 while j < self.max_epLength and not d: j += 1 # choose an action from the inputs (Bayesian dropout + Boltzmann exploration) all_Q_d = np.zeros([self.agent.NUM_ACTIONS]) before_rnn_state = rnn_state[:] for i, mainQN in enumerate(self.mainQN): Q_d, rnn_state[i] = sess.run( [mainQN.Q_dist, mainQN.state_out], feed_dict={ mainQN.inImage: [s[i]], mainQN.portfolio_state: [s_potfol], mainQN.state_in[0]: rnn_state[i][0], mainQN.state_in[1]: rnn_state[i][1], mainQN.temp: e, mainQN.keep_per: (1 - e) + 0.1, mainQN.phase: True }) all_Q_d += Q_d[0] # sum the probabilities from all networks, then normalize #print(np.sum(all_Q_d)) all_Q_d /= len(self.network_type) all_Q_d /= np.sum(all_Q_d) #print(np.sum(all_Q_d)) a = np.random.choice(all_Q_d, p=all_Q_d) action = np.argmax(all_Q_d == a) # pass the action to the policy delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action]) d = self.environment.step() if e > self.endE and total_steps > self.pre_train_steps: e -= stepDrop ''' immediate_reward, delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action]) if e > self.endE and total_steps > self.pre_train_steps: e -= stepDrop # move on to the next index d = self.environment.step() if (delayed_reward == 0 and episode_step % 5 == 0) or d: delayed_reward = immediate_reward self.agent.base_portfolio_value = self.agent.portfolio_value ''' # get the next image and portfolio state #print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'stock code', self.environment.chart_code) s1 = [ self.environment.get_image(days) for days in self.network_type ] s1_potfol = np.array(self.agent.get_states()) episode_reward_buffer.append(delayed_reward) # store in the buffer # original buffer order: state, action, reward, next state, done # modified buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio # re-modified buffer: current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio, reward (discount factor applied) #episode_buffer.add([s, action, delayed_reward, s1, s1_potfol, d, before_rnn_state, rnn_state, s_potfol ] ) episode_buffer.add([ s, action, s1, s1_potfol, d, before_rnn_state, rnn_state, s_potfol ]) if total_steps > self.pre_train_steps and total_steps % self.training_step == 0: try: # sample data from the buffer # update the policy network when in training mode and a delayed reward exists # original buffer order: state, action, reward, next state, done # modified buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio # batch size of the training data trainBatch, size = myBuffer.sample( self.replay_memory, rList) #(self.batch_size) #print('sampled training data shape : ', trainBatch.shape) # the reward must be multiplied by the discount factor so that it propagates to earlier actions for i in range(len(self.network_type)): # the block below performs Double DQN, which updates the target Q-value # pick the action with the main network. 
#Bayesian dropout and Boltzmann exploration are not used during training #for LSTM training, a random episode and a random starting date are picked and replay_memory consecutive steps are used # re-modified buffer: current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio, reward (discount factor applied) feed_dict = { self.mainQN[i].inImage: [datas[i] for datas in trainBatch[:, 2]], self.mainQN[i].portfolio_state: [data for data in trainBatch[:, 3]], self.mainQN[i].state_in[0]: trainBatch[0, 6][i][0], self.mainQN[i].state_in[1]: trainBatch[0, 6][i][1], self.mainQN[i].keep_per: 1.0, self.mainQN[i].phase: True } Q1 = sess.run(self.mainQN[i].predict, feed_dict=feed_dict) del feed_dict feed_dict_2 = { self.targetQN[i].inImage: [datas[i] for datas in trainBatch[:, 2]], self.targetQN[i].portfolio_state: [data for data in trainBatch[:, 3]], self.targetQN[i].state_in[0]: trainBatch[0, 6][i][0], self.targetQN[i].state_in[1]: trainBatch[0, 6][i][1], self.targetQN[i].keep_per: 1.0, self.targetQN[i].phase: True } Q2 = sess.run( self.targetQN[i].Qout, # feed_dict needs to be fixed feed_dict=feed_dict_2) del feed_dict_2 ''' Q1 = sess.run(self.mainQN[i].predict, feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 3])}) # get the Q values from the target network. Q2 = sess.run(self.targetQN[i].Qout, # feed_dict needs to be fixed feed_dict={self.targetQN[i].inImage: np.vstack(trainBatch[:, 3])}) ''' # build a pseudo-label according to whether the episode ended end_multiplier = -(trainBatch[:, 4] - 1) # from the target network's Q values, take the ones for the actions chosen by the main network (this is the double-Q part) doubleQ = Q2[range(size), Q1] # add the discounted double-Q value to the reward; y is the discount factor # targetQ is the immediate reward + the maximum reward of the next state (doubleQ) targetQ = trainBatch[:, 8] + ( self.y * doubleQ * end_multiplier) # update the network with our target values # the loss is the difference between the chosen actions' Q values and targetQ # original buffer order: state, action, reward, next state, done # modified buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio feed_dict = { self.mainQN[i].inImage: [datas[i] for datas in trainBatch[:, 0]], self.mainQN[i].portfolio_state: [data for data in trainBatch[:, 7]], self.mainQN[i].targetQ: targetQ, self.mainQN[i].actions: trainBatch[:, 1], self.mainQN[i].keep_per: 1.0, self.mainQN[i].state_in[0]: trainBatch[0, 5][i][0], self.mainQN[i].state_in[1]: trainBatch[0, 5][i][1], self.mainQN[i].phase: True } _ = sess.run(self.mainQN[i].updateModel, \ feed_dict=feed_dict) del feed_dict ''' _ = sess.run(self.mainQN[i].updateModel, \ feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 0]), self.mainQN[i].targetQ: targetQ, self.mainQN[i].actions: trainBatch[:, 1]}) ''' updateTarget(targetOps, sess) except IndexError as e: print(trainBatch) rAll += delayed_reward #rAll = delayed_reward # move on to the next state del s s = s1 del s_potfol s_potfol = s1_potfol total_steps += 1 episode_step += 1 #portfolio_list.append(self.agent.portfolio_value) # add the discounted rewards to the episode buffer accumulate = 0 episode_reward_buffer.reverse() #print('%s episode_reward_len : ' % ii, len(episode_reward_buffer), 'episode_buffer_len :', len(episode_buffer.buffer)) for i, reward in enumerate(episode_reward_buffer): accumulate = self.discount_factor * accumulate + reward idx = -(i + 1) episode_buffer.buffer[idx] += [accumulate] #print(idx, len(episode_buffer.buffer[idx])) myBuffer.add(episode_buffer.buffer) if len(rList) + 1 >= self.buffer_size: # self.buffer[0:1] = [] del rList[0] rList.append(rAll) self.environment.chartcode_value[ self.environment. 
chart_code] += 1 if self.agent.portfolio_value > self.agent.initial_balance else -1 print("%d %s %d %d %d %d" % (ii, self.environment.chart_code, rAll, self.agent.portfolio_value, self.agent.minimum_portfolio_value, self.agent.maximum_portfolio_value)) #print("%d %4f %d %4f %4f %d %d"% (total_steps, np.mean(rList[-10:]), np.mean(portfolio_list), np.max(rList[-10:]),np.min(rList[-10:]),np.max(portfolio_list),np.min(portfolio_list)))#e) #print(sys.getsizeof(myBuffer.buffer), sys.getsizeof(episode_buffer.buffer)) #portfolio_list= [] if total_steps > self.pre_train_steps and ii % 50 == 0: try: saver.save(sess, self.path + '/model-' + str(ii) + '.cptk') with open('./value_chart.txt', 'w') as f: data = json.dumps(self.environment.chartcode_value) f.write(data) del data #print("Saved Model") except: pass sleep(2) # training finished; report the average reward saver.save(sess, self.path + '/model-' + str(ii) + '.cptk') print("Average reward per episode : " + str(sum(rList) / self.num_episodes)) def test(self): init = tf.global_variables_initializer() saver = tf.train.Saver() trainables = tf.trainable_variables() targetOps = updateTargetGraph(trainables, self.tau) rList = [] total_steps = 0 myBuffer = experience_buffer() episode_buffer = experience_buffer() e = self.startE stepDrop = (self.startE - self.endE) / self.anneling_steps with tf.Session() as sess: if self.load_model == True: print('Loading Model...') # load the saved model ckpt = tf.train.get_checkpoint_state(self.path) saver.restore(sess, ckpt.model_checkpoint_path) # initialize the variables sess.run(init) # set the target network equal to the main network updateTarget(targetOps, sess) # start of the episode loop for ii in range(self.num_episodes): rAll = 0 d = False j = 0 episode_buffer.buffer = [] self.environment.reset() self.agent.reset() rnn_state = np.array( [mainQN.state_init for mainQN in self.mainQN]) print('Initializing episode %d :' % ii, self.environment.idx, self.environment.KOSPI_idx, 'total num :', total_steps, 'stock code', self.environment.chart_code) s = [ self.environment.get_image(days) for days in self.network_type ] s_potfol = np.array(self.agent.get_states()) while j < self.max_epLength and not d: j += 1 # choose an action from the inputs (Bayesian dropout + Boltzmann exploration) all_Q_d = np.zeros([self.agent.NUM_ACTIONS]) before_rnn_state = rnn_state[:] for i, mainQN in enumerate(self.mainQN): Q_d, rnn_state[i] = sess.run( [mainQN.Q_dist, mainQN.state_out], feed_dict={ mainQN.inImage: [s[i]], mainQN.portfolio_state: [s_potfol], mainQN.state_in[0]: rnn_state[i][0], mainQN.state_in[1]: rnn_state[i][1], mainQN.temp: e, mainQN.keep_per: (1 - e) + 0.1, mainQN.phase: True }) all_Q_d += Q_d[0] # sum the probabilities from all networks, then normalize # print(np.sum(all_Q_d)) all_Q_d /= len(self.network_type) all_Q_d[0] += 1 - np.sum(all_Q_d) # print(np.sum(all_Q_d)) a = np.random.choice(all_Q_d, p=all_Q_d) action = np.argmax(all_Q_d == a) # pass the action to the policy immediate_reward, delayed_reward = self.agent.act( action=action, confidence=all_Q_d[action]) if e > self.endE and total_steps > self.pre_train_steps: e -= stepDrop if delayed_reward == 0 and total_steps % 5 == 0: delayed_reward = immediate_reward self.agent.base_portfolio_value = self.agent.portfolio_value # move on to the next index d = self.environment.step() # get the next image and portfolio state # print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'stock code', self.environment.chart_code) s1 = [ self.environment.get_image(days) for days in self.network_type ] s1_potfol = np.array(self.agent.get_states()) # store in the buffer # original buffer order: state, action, reward, next state, done # modified buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio rAll += 
delayed_reward # move on to the next state del s s = s1 del s_potfol s_potfol = s1_potfol total_steps += 1 if total_steps > self.pre_train_steps and ii % 50 == 0: saver.save(sess, self.path + '/model-' + str(ii) + '.cptk') print("Saved Model") sleep(3) rList.append(rAll) myBuffer.add(episode_buffer.buffer) if len(rList) % 10 == 0: print(total_steps, np.mean(rList[-10:]), e) sleep(2) # saver.save(sess, self.path + '/model-' + str(i) + '.cptk') # report the average reward print("Average reward per episode : " + str(sum(rList) / self.num_episodes))
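# Illustrative sketch (not part of the original code): the Double DQN target that
# PolicyLearner.train() computes above, written as a standalone NumPy function.
# gamma corresponds to self.y and (1 - done) to end_multiplier; all names are hypothetical.
import numpy as np

def double_dqn_target(reward, q_main_next, q_target_next, done, gamma=0.99):
    """targetQ = r + gamma * Q_target(s', argmax_a Q_main(s', a)) * (1 - done)."""
    best_actions = np.argmax(q_main_next, axis=1)                    # actions picked by the main network
    double_q = q_target_next[np.arange(len(reward)), best_actions]   # evaluated by the target network
    return reward + gamma * double_q * (1.0 - done)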
class management: def _init_environment(self, datapath, window_size): df = pd.read_csv(datapath) bid_price_columns = [i for i in range(1, len(df.columns), 20)] print(bid_price_columns) ask_price_columns = [i for i in range(3, len(df.columns), 20)] bidPrices = df[df.columns[bid_price_columns]] askPrices = df[df.columns[ask_price_columns]] df_concat = pd.concat([bidPrices, askPrices]) midPrices = df_concat.groupby( df_concat.index).mean().transpose().values[-len(self.securities):] print(midPrices[:, 0]) self.env = DummyVecEnv( [lambda: securities_trading_env(np.array(midPrices).T)]) self.env = VecCheckNan(self.env, raise_exception=True) n_actions = self.env.action_space.shape[-1] param_noise = None action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions)) print(n_actions) if (self.policy == "DDPG"): self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose), param_noise=param_noise, action_noise=action_noise) elif (self.policy == "TD3"): self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose)) else: self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose)) if self.load: #load model self.model = self.model.load("save/" + self.modelpath + ".h5") #init model class self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host, policy, strategy, cash_balance, self.model, self.env, window_size, self.inventory) def _init_sec_prices(self, securities): sec_state = dict() for sec in securities: sec_state.setdefault(sec, None) return sec_state def _init_market_dict(self, market_event_securities, market_event_queue): market_dict = dict() for sec in market_event_securities: sym_dict = dict() for e in market_event_queue: sym_dict[e] = None market_dict[sec] = sym_dict return market_dict # size of each security held is set to 0 initially def _init_inventory(self, securities): inventory = dict() for sec in securities: inventory[sec] = 0.0 return inventory def __init__(self, market_event_securities, market_event_queue, securities, queue, host, policy, strategy, cash_balance, load, train, train_only, verbose, modelpath, datapath, train_steps, test_steps, window_size, episodes): logging.basicConfig(level=logging.INFO) self.policy = policy self.strategy = strategy self.verbose = verbose self.load = load self.train = train self.modelpath = modelpath self.strategy = strategy # identifier for different clients self.market_event_securities = market_event_securities # strings of securities, e.g. [ZFH0:MBO,ZTH0:MBO,UBH0:MBO,ZNH0:MBO,ZBH0:MBO] self.market_event_queue = market_event_queue # strings of names of prices in market_event_securities, e.g. 
[L1, L2, L3] self.securities = securities self.num_of_securities = len( self.securities) # number of securities the bot will trade in self.internalID = 0 # internal id for every order the bot wants to send self.steps = 0 # number of trades the bot has made self.cash_balance = cash_balance self.inventory = self._init_inventory( self.securities) # size of each security hold self.inventoryValue = 0.0 self.PnL = self.cash_balance + self.inventoryValue self.outputfile = "save/" + strategy + "_logs.txt" self._init_environment(datapath, window_size) if self.train: #Train model if true for e in range(episodes): logging.info(" Episode : %s" % str(e)) self.gym_model.train_model(train_steps, test_steps) self.model = self.gym_model.model model_save = "save/" + self.modelpath + ".h5" logging.info("Model saved as: " + model_save) with open(self.outputfile, "a") as myfile: myfile.write("Model saved as: %s \n" % model_save) self.model.save(model_save) if train_only: return self.market_dict = self._init_market_dict( self.market_event_securities, self.market_event_queue) # L1-L5 levels data # self.market_dict["ZTH0:MBO"]["L1"] to read l1 data of ZTH0:MBO self.ask_trend = self._init_market_dict(self.market_event_securities, self.market_event_queue) # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] goes up, self.ask_trend["ZTH0:MBO"]["L1"] = 1 # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] goes down, self.ask_trend["ZTH0:MBO"]["L1"] = -1 # if self.market_dict["ZTH0:MBO"]["L1"]["L1AskPrice"] stays the same, self.ask_trend["ZTH0:MBO"]["L1"] = 0 self.bid_trend = self._init_market_dict(self.market_event_securities, self.market_event_queue) # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] goes up, self.bid_trend["ZTH0:MBO"]["L1"] = 1 # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] goes down, self.bid_trend["ZTH0:MBO"]["L1"] = -1 # if self.market_dict["ZTH0:MBO"]["L1"]["L1BidPrice"] stays the same, self.bid_trend["ZTH0:MBO"]["L1"] = 0 self.mid_market = self._init_sec_prices( securities ) # half of the sum of current L1 ask price and L1 bid price self.exIds_to_inIds = dict( ) # when your order is acked, the bot will receive an external id for it. map exid to inid here. 
self.inIds_to_orders_sent = dict() # orders sent but not acked self.inIds_to_orders_confirmed = dict( ) # orders confirmed by matching agent self.talk = Communication(market_event_securities, market_event_queue, securities, queue, host, callback_for_levels=self.callback_for_levels, callback_for_acks=self.callback_for_acks, callback_for_trades=self.callback_for_trades) self.talk.kickoff() def _save_order_being_sent(self, order): self.inIds_to_orders_sent[order["orderNo"]] = order def cancel_order(self, order): self.talk._cancel_order(order) def send_order(self, order): if order["side"] == 'B' and self.PnL < order["price"] * order[ "origQty"]: logging.warning("portfolio : " + str(self.PnL)) logging.warning("Not enough portfolio to buy " + str(order["origQty"]) + " " + order["symb"]) return False elif order["side"] == 'S' and self.inventory[ order["symb"]] < order["origQty"]: logging.warning(order["symb"] + " : " + str(self.inventory[order["symb"]])) logging.warning("Not enough " + order["symb"] + " to sell") return False else: order["orderNo"] = self.internalID self._save_order_being_sent(order) logging.info("\n Order %s is sent" % str(order["orderNo"])) self.internalID += 1 self.talk._send_order(order) return True def _update_with_trade(self, tradeobj, side, exId): # buy side = 1, sell side = -1 self._update_inventory(tradeobj.symbol, tradeobj.tradeSize * side) self._update_inventory_value() self._update_cash(tradeobj.tradeSize, tradeobj.tradePrice * (-side)) self._update_pnl() self._update_order_remain(exId, tradeobj.tradeSize) logging.info(" [X] Cash : %s" % str(self.cash_balance)) logging.info(" [X] Inventory Value : %s" % str(self.inventoryValue)) logging.info(" [X] Portfolio Value : %s" % str(self.PnL)) with open(self.outputfile, "a") as myfile: myfile.write(" [X] Cash : %s\n" % str(self.cash_balance)) myfile.write(" [X] Inventory Value : %s\n" % str(self.inventoryValue)) myfile.write(" [X] Portfolio Value : %s\n" % str(self.PnL)) def _update_inventory(self, symbol, size): self.inventory[symbol] += size logging.debug(" [X] inventory:") with open(self.outputfile, "a") as myfile: for sec in self.securities: logging.info("%s : %d" % (sec, self.inventory[sec])) myfile.write("%s : %d\n" % (sec, self.inventory[sec])) def _update_inventory_value(self, ): inventoryValue = 0.0 for sec in self.securities: if self.mid_market[sec] is not None: inventoryValue += self.inventory[sec] * self.mid_market[sec] self.inventoryValue = inventoryValue logging.debug(" [X] inventory value: %d" % self.inventoryValue) for sec in self.securities: logging.debug("%s : %d" % (sec, self.inventory[sec])) def _update_cash(self, size, price): self.cash_balance += size * price logging.debug(" [X] cash balance: %d" % self.cash_balance) def _update_pnl(self, ): self.PnL = self.cash_balance + self.inventoryValue logging.debug(" [X] portfolio value: %d" % self.PnL) def _update_order_remain(self, exId, size): inId = self.exIds_to_inIds[exId] self.inIds_to_orders_confirmed[inId]["remainingQty"] -= size if self.inIds_to_orders_confirmed[inId]["remainingQty"] == 0: self.inIds_to_orders_confirmed.pop(inId) # only accept trade which belongs to this bot def _condition_to_accept_trade(self, tradeobj): exId = 0 if tradeobj.buyOrderNo in list(self.exIds_to_inIds.keys()): print(self.exIds_to_inIds) with open(self.outputfile, "a") as myfile: myfile.write( "Order %s : Buy Order %d is filled with quantity %d of price %s\n" % (str(tradeobj.buyOrderNo), self.exIds_to_inIds[tradeobj.buyOrderNo], tradeobj.tradeSize, tradeobj.tradePrice)) 
logging.info( "Order %s : Buy Order %d is filled with quantity %d of price %s\n" % (str(tradeobj.buyOrderNo), self.exIds_to_inIds[tradeobj.buyOrderNo], tradeobj.tradeSize, tradeobj.tradePrice)) return tradeobj.buyOrderNo, 1 elif tradeobj.sellOrderNo in list(self.exIds_to_inIds.keys()): print(self.exIds_to_inIds) logging.info( "Order %s : Sell Order %d is filled with quantity %d of price %s\n" % (str(tradeobj.sellOrderNo), self.exIds_to_inIds[tradeobj.sellOrderNo], tradeobj.tradeSize, tradeobj.tradePrice)) with open(self.outputfile, "a") as myfile: myfile.write( "Order %s : Sell Order %d is filled with quantity %d of price %s\n" % (str(tradeobj.sellOrderNo), self.exIds_to_inIds[tradeobj.sellOrderNo], tradeobj.tradeSize, tradeobj.tradePrice)) return tradeobj.sellOrderNo, -1 else: return exId, 0 def callback_for_trades(self, tradeobj): exId, side = self._condition_to_accept_trade(tradeobj) if side == -1 or side == 1: # update inventory and pnl, manage orders, decrease remaining qty; if remaining qty is 0, remove it from orders_confirmed self._update_with_trade(tradeobj, side, exId) self.steps = self.steps + 1 self.gym_model.model_reaction_to_trade(tradeobj) def _update_with_ack(self, aMobj): inId = aMobj.internalOrderNo exId = aMobj.orderNo if aMobj.action == "A" and (inId in self.inIds_to_orders_sent): self.inIds_to_orders_confirmed[ inId] = self.inIds_to_orders_sent.pop(inId) self.exIds_to_inIds[exId] = inId logging.info("ExId: %s -> InId: %s" % (exId, inId)) elif aMobj.action == "D" and (inId in self.inIds_to_orders_confirmed): self.inIds_to_orders_sent[ inId] = self.inIds_to_orders_confirmed.pop(inId) self.exIds_to_inIds[exId] = inId logging.info("ExId: %s -> InId: %s" % (exId, inId)) # record orders which are not successfully sent or canceled in case you want to send them again, and map exid to inid def callback_for_acks(self, aMobj): if (aMobj.strategy == self.strategy): self._update_with_ack(aMobj) self.gym_model.model_reaction_to_ack(aMobj) def _update_trend(self, trend, symbol, lv, oldprice, newprice): if newprice > oldprice: trend[symbol][lv] = 1 elif newprice < oldprice: trend[symbol][lv] = -1 else: trend[symbol][lv] = 0 def _update_market_dict(self, tob): sym = tob["symb"] for lv in self.market_event_queue: if tob[lv + "AskPrice"] is not None and tob[lv + "BidPrice"] is not None: if self.market_dict[sym][lv] is not None: self._update_trend( self.bid_trend, sym, lv, oldprice=self.market_dict[sym][lv][lv + "BidPrice"], newprice=tob[lv + "BidPrice"]) self._update_trend( self.ask_trend, sym, lv, oldprice=self.market_dict[sym][lv][lv + "AskPrice"], newprice=tob[lv + "AskPrice"]) self.market_dict[sym][lv] = { lv + "AskPrice": tob[lv + "AskPrice"], lv + "BidPrice": tob[lv + "BidPrice"], lv + "AskSize": tob[lv + "AskSize"], lv + "BidSize": tob[lv + "BidSize"] } self.mid_market[sym] = 0.5 * ( self.market_dict[sym]["L1"]["L1AskPrice"] + self.market_dict[sym]["L1"]["L1BidPrice"]) #if (sym == "ZBH0:MBO"): # print("\n"+sym + ": " +str( self.mid_market[sym])) #print(self.mid_market) # self._update_inventory_value() # self._update_pnl() # should be called when new level data arrives def callback_for_levels(self, tob): self._update_market_dict(tob) if tob["symb"] in self.securities: self._update_inventory_value() self._update_pnl() observation = np.array([v for v in self.mid_market.values()]) orders = self.gym_model.model_reaction_to_level( observation, self.inventory) for order in orders: self.send_order(order)
def train(env_name, print_things=True, train_run_id=0, train_episodes=5000): # Create a Gym environment env = gym.make(env_name) # Get dimensionalities of actions and observations action_space_dim = env.action_space.shape[-1] observation_space_dim = env.observation_space.shape[-1] # Instantiate agent and its policy policy = Policy(observation_space_dim, action_space_dim) agent = Agent(policy) # Arrays to keep track of rewards reward_history, timestep_history = [], [] average_reward_history = [] start = time.time() # Run actual training for episode_number in range(train_episodes): reward_sum, timesteps = 0, 0 done = False # Reset the environment and observe the initial state observation = env.reset() # Loop until the episode is over while not done: # Get action from the agent action, action_probabilities = agent.get_action(observation) previous_observation = observation # Perform the action on the environment, get new state and reward observation, reward, done, info = env.step(action.detach().numpy()) # Store action's outcome (so that the agent can improve its policy) agent.store_outcome(previous_observation, action_probabilities, action, reward) # Store total episode reward reward_sum += reward timesteps += 1 if print_things: print("Episode {} finished. Total reward: {:.3g} ({} timesteps)". format(episode_number, reward_sum, timesteps)) # Bookkeeping (mainly for generating plots) reward_history.append(reward_sum) timestep_history.append(timesteps) if episode_number > 100: avg = np.mean(reward_history[-100:]) else: avg = np.mean(reward_history) average_reward_history.append(avg) # Let the agent do its magic (update the policy) agent.episode_finished(episode_number) # Training is finished - plot rewards if print_things: plt.plot(reward_history) plt.plot(average_reward_history) plt.legend(["Reward", "100-episode average"]) plt.title("Reward history") plt.show() print("Training finished.") end = time.time() print("Running time: {:.04f}".format((end - start))) data = pd.DataFrame({ "episode": np.arange(len(reward_history)), "train_run_id": [train_run_id] * len(reward_history), # TODO: Change algorithm name for plots, if you want "algorithm": ["PG"] * len(reward_history), "reward": reward_history }) torch.save(agent.policy.state_dict(), "model_%s_%d.mdl" % (env_name, train_run_id)) return data
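# Hypothetical usage sketch: the environment name is an assumption (any Gym environment
# with a continuous action space fits this Policy/Agent pair); train() returns a pandas
# DataFrame of per-episode rewards that can be concatenated across runs for plotting.
if __name__ == "__main__":
    results = train("InvertedPendulum-v2", print_things=True, train_run_id=0, train_episodes=500)
    print(results.tail())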
from agent import Agent from problem import Problem problem = Problem() #initial_state = [1,2,3,8,4,0,7,6,5] # really easy problem (1 step) #initial_state = [1,2,3,8,6,0,7,5,4] # easy problem (3 steps) #initial_state = [1,2,3,8,4,7,0,6,5] # mid problem (12 steps) #initial_state = [1,3,2,4,0,7,8,6,5] initial_state = [5, 2, 8, 4, 1, 7, 0, 3, 6] # hard problem (22 steps) with goal state # as [1,2,3,4,5,6,7,8,0] agent = Agent(initial_state) print('Current state:') agent.print_state(initial_state) agent.do_action(problem) print('Solution:') while agent.solve_stack: agent.do_action(problem) print('End')
def train_and_test_agent(environment_name='CarRacing-v0', num_episodes=100, max_test_length=5, target_update_freq=1, initial_test_freq=2, replay_buffer_size=5e4, training_batch_size=32, learning_rate=0.001, rewards_threshold=900, action_step_length=3): # Set up environment and agent env = gym.make(environment_name) agent = Agent(env, replay_buffer_size=replay_buffer_size, learning_rate=learning_rate, training_batch_size=training_batch_size, num_actions=(5, 4, 4)) # Set up logging meta_data = ({ 'env': environment_name, 'target_update_freq': target_update_freq, 'replay_buffer_size': replay_buffer_size, 'training_batch_size': training_batch_size, 'learning_rate': learning_rate }) vars_to_track = ('episode', 'testing', 'epsilon', 'step_count', 'rewards') logger = GymLogger(meta_data, vars_to_track) # Start by filling replay buffer using random actions while len(agent.memory.replay_buffer) < 100: env.reset() state = None done = False while not done: action_num = np.random.choice(agent.num_actions) action = agent.action_space[action_num, :] next_state, reward, done, took_all_steps = take_steps( env, action, action_step_length) if took_all_steps and state is not None: if state.shape[2] == 9 and next_state.shape[2] == 9: # Fix this agent.memory.update( (state, action_num, reward, next_state, done)) state = next_state # Use testing flag to determine action selection below testing = False test_rewards = deque() best_test_rewards = 0 for episode in range(num_episodes): # Throw away first few frames when camera is zooming in env.reset() for _ in range(10): state, _, _, _ = take_steps(env, (0, 1, 0), action_step_length) done = False episode_reward = 0 step_count = 0 while not done: if not testing: action_num, action = agent.determine_action(state) else: action_num, action = agent.act(state) next_state, reward, done, took_all_steps = take_steps( env, action, action_step_length) episode_reward += reward if took_all_steps and state is not None: if state.shape[2] == 9 and next_state.shape[2] == 9: # Fix this agent.memory.update( (state, action_num, reward, next_state, done)) agent.learner.train( agent.memory.random_sample( num_samples=training_batch_size)) state = next_state step_count += 1 if done: logger.update((episode, testing, agent.epsilon, step_count, episode_reward)) if testing: test_rewards.append(episode_reward) mean_test_rewards = np.mean(test_rewards) print('episode: ' + str(episode)) print('step count: ' + str(step_count)) print('learning rate: ' + str(learning_rate)) print('epsilon: ' + str(agent.epsilon)) print('mean test rewards: ' + str(mean_test_rewards)) print('test episodes: ' + str(len(test_rewards))) print('\n') if mean_test_rewards > best_test_rewards: print('New best test result') logger.save_model_weights( agent.learner.target_model.get_weights(), 'best') best_test_rewards = mean_test_rewards if mean_test_rewards < rewards_threshold: testing = False test_rewards.clear() elif len(test_rewards) == max_test_length: print('Success!') logger.save_model_weights( agent.learner.target_model.get_weights(), episode) break if (episode >= target_update_freq) and (episode % target_update_freq == 0): agent.learner.update_target() if (episode > initial_test_freq) and (episode % initial_test_freq) == 0: testing = True agent.udpate_epsilon() logger.save_history() env.close()
# number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.visual_observations[0] print('States look like:', state) print('States have shape:', state.shape) state = process_observation(state, device) # load the weights from file agent = Agent(input_shape=state.shape[1:], action_size=action_size, seed=0) agent.qnetwork_local.load_state_dict( torch.load('../checkpoints/dueling_checkpoint.pth')) score = 0 # initialize the score for i in range(3): env_info = env.reset(train_mode=False)[brain_name] state = env_info.visual_observations[0] # get the current state state = process_observation(state, device) for j in range(2000): action = agent.act(state) env_info = env.step(action)[brain_name]
def __init__(self): Agent.__init__(self)
LR_CRITIC = 1e-3 # learning rate of the critic SEED = 0 TAU = 6e-2 # for soft update of target parameters WEIGHT_DECAY = 0 # L2 weight decay UPDATE_EVERY = 1 # time steps between network updates #N_UPDATES = 1 # number of times training ADD_NOISE = True #eps_start = 6 # Noise level start #eps_end = 0 # Noise level end #eps_decay = 250 # Number of episodes to decay over from start to end device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("DEVICE:{}".format(device)) """ Setup two independent agents with shared experience memory """ agent_0 = Agent(state_size, action_size, num_agents=1, seed=SEED) agent_1 = Agent(state_size, action_size, num_agents=1, seed=SEED) n_episodes = 1000 scores_window = deque(maxlen=100) scores_all = [] rolling_average = [] elapsed_time_list = [] list1 = [] list2 = [] list3 = [] for i_episode in range(0, n_episodes): # Start the clock start_time = time.time()
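# Illustrative sketch (not taken from the Agent class): the soft target-network update
# that the TAU hyperparameter above controls, written for PyTorch modules.
def soft_update(local_model, target_model, tau=TAU):
    """theta_target <- tau * theta_local + (1 - tau) * theta_target"""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)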
import gym import torch import numpy as np from collections import deque import matplotlib.pyplot as plt from agent import Agent env = gym.make('CartPole-v1') agent = Agent(env.observation_space.shape[0], env.action_space.n, seed=0, batch_size=32, q_size=25) env.seed(0) def DQN(num_episodes = 7500, max_iteration = 1000, init_epsilon = 1.0, min_epsilon = 0.05, decay = 0.999): ''' :param num_episodes: maximum number of training episodes :param max_iteration: maximum number of steps per episode :param init_epsilon: initial value of epsilon for epsilon-greedy action selection :param min_epsilon: lower bound on epsilon :param decay: multiplicative factor (per episode) for decreasing epsilon :return: ''' total_reward = [] total_reward_window = deque(maxlen=100) epsilon = init_epsilon
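# Illustrative sketch only (the original DQN() body continues beyond this excerpt):
# how init_epsilon, min_epsilon and decay typically translate into a per-episode
# epsilon schedule with multiplicative decay and a floor.
def epsilon_schedule(num_episodes=7500, init_epsilon=1.0, min_epsilon=0.05, decay=0.999):
    epsilon, schedule = init_epsilon, []
    for _ in range(num_episodes):
        schedule.append(epsilon)
        epsilon = max(min_epsilon, decay * epsilon)  # decay each episode, never below the floor
    return schedule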
def agent(): from agent import Agent agent = Agent(action_space=[0, 1, 2, 3, 4]) return agent
class ReinforcementLearner: __metaclass__ = abc.ABCMeta lock = threading.Lock() def __init__(self, rl_method='rl', stock_code=None, chart_data=None, training_data=None, min_trading_unit=1, max_trading_unit=2, delayed_reward_threshold=.05, net='dqn', num_steps=5, lr=0.001, value_network=None, policy_network=None, output_path='', reuse_models=True): # validate arguments assert min_trading_unit > 0 assert max_trading_unit > 0 assert max_trading_unit >= min_trading_unit assert num_steps > 0 assert lr > 0 # reinforcement learning method self.rl_method = rl_method # environment setup self.stock_code = stock_code self.chart_data = chart_data self.environment = Environment(chart_data) # agent setup self.agent = Agent(self.environment, min_trading_unit=min_trading_unit, max_trading_unit=max_trading_unit, delayed_reward_threshold=delayed_reward_threshold) # training data self.training_data = training_data self.sample = None self.training_data_idx = -1 # feature vector size = training data vector size + agent state size self.num_features = self.agent.STATE_DIM if self.training_data is not None: self.num_features += self.training_data.shape[1] # network settings self.net = net self.num_steps = num_steps self.lr = lr self.value_network = value_network self.policy_network = policy_network self.reuse_models = reuse_models # visualization module self.visualizer = Visualizer() # memory self.memory_sample = [] self.memory_action = [] self.memory_reward = [] self.memory_value = [] self.memory_policy = [] self.memory_pv = [] self.memory_num_stocks = [] self.memory_exp_idx = [] self.memory_learning_idx = [] # epoch-related information self.loss = 0. self.itr_cnt = 0 self.exploration_cnt = 0 self.batch_size = 0 self.learning_cnt = 0 # output path for logs and results self.output_path = output_path def init_value_network(self, shared_network=None, activation='linear', loss='mse'): if self.net == 'lstm': self.value_network = LSTMNetwork(input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS, lr=self.lr, num_steps=self.num_steps, shared_network=shared_network, activation=activation, loss=loss) if self.reuse_models and \ os.path.exists(self.value_network_path): self.value_network.load_model(model_path=self.value_network_path) def init_policy_network(self, shared_network=None, activation='sigmoid', loss='binary_crossentropy'): if self.net == 'lstm': self.policy_network = LSTMNetwork( input_dim=self.num_features, output_dim=self.agent.NUM_ACTIONS, lr=self.lr, num_steps=self.num_steps, shared_network=shared_network, activation=activation, loss=loss) if self.reuse_models and \ os.path.exists(self.policy_network_path): self.policy_network.load_model(model_path=self.policy_network_path) def reset(self): self.sample = None self.training_data_idx = -1 # reset the environment self.environment.reset() # reset the agent self.agent.reset() # reset the visualizer self.visualizer.clear([0, len(self.chart_data)]) # reset memory self.memory_sample = [] self.memory_action = [] self.memory_reward = [] self.memory_value = [] self.memory_policy = [] self.memory_pv = [] self.memory_num_stocks = [] self.memory_exp_idx = [] self.memory_learning_idx = [] # reset epoch-related information self.loss = 0. 
self.itr_cnt = 0 self.exploration_cnt = 0 self.batch_size = 0 self.learning_cnt = 0 def build_sample(self): self.environment.observe() if len(self.training_data) > self.training_data_idx + 1: self.training_data_idx += 1 self.sample = self.training_data.iloc[ self.training_data_idx].tolist() self.sample.extend(self.agent.get_states()) return self.sample return None @abc.abstractmethod def get_batch(self, batch_size, delayed_reward, discount_factor): pass def update_networks(self, batch_size, delayed_reward, discount_factor): # build a batch of training data x, y_value, y_policy = self.get_batch(batch_size, delayed_reward, discount_factor) if len(x) > 0: loss = 0 if y_value is not None: # update the value network loss += self.value_network.train_on_batch(x, y_value) if y_policy is not None: # update the policy network loss += self.policy_network.train_on_batch(x, y_policy) return loss return None def fit(self, delayed_reward, discount_factor, full=False): batch_size = len(self.memory_reward) if full \ else self.batch_size # build batch training data and update the networks if batch_size > 0: _loss = self.update_networks(batch_size, delayed_reward, discount_factor) if _loss is not None: self.loss += abs(_loss) self.learning_cnt += 1 self.memory_learning_idx.append(self.training_data_idx) self.batch_size = 0 def visualize(self, epoch_str, num_epoches, epsilon): self.memory_action = [Agent.ACTION_HOLD] \ * (self.num_steps - 1) + self.memory_action self.memory_num_stocks = [0] * (self.num_steps - 1) \ + self.memory_num_stocks if self.value_network is not None: self.memory_value = [np.array([np.nan] \ * len(Agent.ACTIONS))] * (self.num_steps - 1) \ + self.memory_value if self.policy_network is not None: self.memory_policy = [np.array([np.nan] \ * len(Agent.ACTIONS))] * (self.num_steps - 1) \ + self.memory_policy self.memory_pv = [self.agent.initial_balance] \ * (self.num_steps - 1) + self.memory_pv self.visualizer.plot( epoch_str=epoch_str, num_epoches=num_epoches, epsilon=epsilon, action_list=Agent.ACTIONS, actions=self.memory_action, num_stocks=self.memory_num_stocks, outvals_value=self.memory_value, outvals_policy=self.memory_policy, exps=self.memory_exp_idx, learning_idxes=self.memory_learning_idx, initial_balance=self.agent.initial_balance, pvs=self.memory_pv, ) self.visualizer.save( os.path.join(self.epoch_summary_dir, 'epoch_summary_{}.png'.format(epoch_str))) def run(self, num_epoches=100, balance=10000000, discount_factor=0.9, start_epsilon=0.5, learning=True): info = "[{code}] RL:{rl} Net:{net} LR:{lr} " \ "DF:{discount_factor} TU:[{min_trading_unit}," \ "{max_trading_unit}] DRT:{delayed_reward_threshold}".format( code=self.stock_code, rl=self.rl_method, net=self.net, lr=self.lr, discount_factor=discount_factor, min_trading_unit=self.agent.min_trading_unit, max_trading_unit=self.agent.max_trading_unit, delayed_reward_threshold=self.agent.delayed_reward_threshold ) with self.lock: logging.info(info) # start time time_start = time.time() # prepare visualization; the chart data does not change, so visualize it up front self.visualizer.prepare(self.environment.chart_data, info) # prepare the folder where visualization results will be saved self.epoch_summary_dir = os.path.join( self.output_path, 'epoch_summary_{}'.format(self.stock_code)) if not os.path.isdir(self.epoch_summary_dir): os.makedirs(self.epoch_summary_dir) else: for f in os.listdir(self.epoch_summary_dir): os.remove(os.path.join(self.epoch_summary_dir, f)) # set the agent's initial balance self.agent.set_balance(balance) # reset training statistics max_portfolio_value = 0 epoch_win_cnt = 0 # training loop for epoch in range(num_epoches): time_start_epoch = time.time() # queue for building num_steps-length samples q_sample = 
collections.deque(maxlen=self.num_steps) # reset the environment, agent, networks, visualizer, and memory self.reset() # decrease the exploration rate as training progresses if learning: epsilon = start_epsilon \ * (1. - float(epoch) / (num_epoches - 1)) self.agent.reset_exploration() else: epsilon = start_epsilon while True: # build a sample next_sample = self.build_sample() if next_sample is None: break # collect num_steps samples q_sample.append(next_sample) if len(q_sample) < self.num_steps: continue # value and policy network predictions pred_value = None pred_policy = None if self.value_network is not None: pred_value = self.value_network.predict(list(q_sample)) if self.policy_network is not None: pred_policy = self.policy_network.predict(list(q_sample)) # decide an action from the networks or by exploration action, confidence, exploration = \ self.agent.decide_action( pred_value, pred_policy, epsilon) # perform the chosen action and receive the immediate and delayed rewards immediate_reward, delayed_reward = \ self.agent.act(action, confidence) # remember the action and its outcome self.memory_sample.append(list(q_sample)) self.memory_action.append(action) self.memory_reward.append(immediate_reward) if self.value_network is not None: self.memory_value.append(pred_value) if self.policy_network is not None: self.memory_policy.append(pred_policy) self.memory_pv.append(self.agent.portfolio_value) self.memory_num_stocks.append(self.agent.num_stocks) if exploration: self.memory_exp_idx.append(self.training_data_idx) # update iteration statistics self.batch_size += 1 self.itr_cnt += 1 self.exploration_cnt += 1 if exploration else 0 # mini-batch training whenever a delayed reward occurs if learning and (delayed_reward != 0): self.fit(delayed_reward, discount_factor) # training at the end of the epoch if learning: self.fit(self.agent.profitloss, discount_factor, full=True) # log epoch information num_epoches_digit = len(str(num_epoches)) epoch_str = str(epoch + 1).rjust(num_epoches_digit, '0') time_end_epoch = time.time() elapsed_time_epoch = time_end_epoch - time_start_epoch if self.learning_cnt > 0: self.loss /= self.learning_cnt logging.info("[{}][Epoch {}/{}] Epsilon:{:.4f} " "#Expl.:{}/{} #Buy:{} #Sell:{} #Hold:{} " "#Stocks:{} PV:{:,.0f} " "LC:{} Loss:{:.6f} ET:{:.4f}".format( self.stock_code, epoch_str, num_epoches, epsilon, self.exploration_cnt, self.itr_cnt, self.agent.num_buy, self.agent.num_sell, self.agent.num_hold, self.agent.num_stocks, self.agent.portfolio_value, self.learning_cnt, self.loss, elapsed_time_epoch)) # visualize epoch information self.visualize(epoch_str, num_epoches, epsilon) # update training statistics max_portfolio_value = max(max_portfolio_value, self.agent.portfolio_value) if self.agent.portfolio_value > self.agent.initial_balance: epoch_win_cnt += 1 # end time time_end = time.time() elapsed_time = time_end - time_start # log overall training information with self.lock: logging.info("[{code}] Elapsed Time:{elapsed_time:.4f} " "Max PV:{max_pv:,.0f} #Win:{cnt_win}".format( code=self.stock_code, elapsed_time=elapsed_time, max_pv=max_portfolio_value, cnt_win=epoch_win_cnt)) return self.memory_pv def save_models(self): if self.value_network is not None and \ self.value_network_path is not None: self.value_network.save_model(self.value_network_path) if self.policy_network is not None and \ self.policy_network_path is not None: self.policy_network.save_model(self.policy_network_path)
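# Hypothetical usage sketch: ReinforcementLearner is abstract (get_batch is left to a
# subclass), so the concrete learner class and the pre-loaded chart_data/training_data
# passed in here are assumptions, not part of this file.
def run_learner(learner_cls, stock_code, chart_data, training_data, output_path='./output'):
    learner = learner_cls(stock_code=stock_code, chart_data=chart_data,
                          training_data=training_data, output_path=output_path)
    portfolio_values = learner.run(num_epoches=100, balance=10000000,
                                   discount_factor=0.9, start_epsilon=0.5, learning=True)
    learner.save_models()
    return portfolio_values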
def one_player(): board = Board() agent = Agent(board, PLAYER_X, exploration_rate=0) agent.q_values = pickle.load(open("model/tic-tac-toe-agent-x-epochs-5000.pickle", "rb")) game = Game() game.one_player(board, agent)
# create a pyglet window and set glOptions win = window.Window(width=500, height=500, vsync=True, resizable=True) glEnable(GL_BLEND) glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) # needed so that egi knows where to draw egi.InitWithPyglet(win) # prep the fps display fps_display = clock.ClockDisplay() # register key and mouse event handlers win.push_handlers(on_key_press) win.push_handlers(on_mouse_press) win.push_handlers(on_resize) # create a world for agents world = World(500, 500) # add one agent world.agents.append(Agent(world)) # unpause the world ready for movement world.paused = False while not win.has_exit: win.dispatch_events() glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # show nice FPS bottom right (default) delta = clock.tick() world.update(delta) world.render() fps_display.draw() # swap the double buffer win.flip()