class TrainedPlayer(AbstractPlayer):
    def __init__(self, env, number, file_name):
        super().__init__()
        self.env = env
        self.number = number
        self.action = 0
        self.nb_actions = spaces.Discrete(len(HAND)).n

        # Build the model.
        self.model = Sequential()
        self.model.add(
            Flatten(input_shape=(1,) + spaces.Box(low=0,
                                                  high=2,
                                                  shape=((PLAYER_NUM + 1) * 2, ROUND_NUM),
                                                  dtype='float32').shape))
        self.model.add(Dense(256))
        self.model.add(Activation('relu'))
        self.model.add(Dense(256))
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

        # Configure the agent.
        memory = SequentialMemory(limit=50000, window_length=1)
        policy = EpsGreedyQPolicy()
        self.dqn = DQNAgent(model=self.model,
                            nb_actions=self.nb_actions,
                            memory=memory,
                            nb_steps_warmup=1000,
                            target_model_update=1e-2,
                            policy=policy)
        self.dqn.compile(Adam(learning_rate=1e-3), metrics=[])
        self.dqn.load_weights(file_name)
        print('Model loading... done')

    def play(self):
        self.action = self.dqn.forward(self.env.get_observation(self.number))
        # print("Trained AI Action:{}".format(
        #     self.dqn.compute_q_values([self.env.get_observation(self.number)])
        # ))

        # If the chosen action is not available
        if self.used[self.action] == 1:
            # Re-select the nearest higher available action
            for i in range(self.action, len(HAND)):
                if self.used[i] == 0.:
                    self.action = i
                    break
            # If that still fails
            if self.used[self.action] == 1:
                # Re-select the nearest lower available action
                for i in range(self.action, -1, -1):
                    if self.used[i] == 0.:
                        self.action = i
                        break
        self.used[self.action] = 1
        return self.action
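# The class above assumes the usual Keras / keras-rl / gym imports plus project-specific
# names (AbstractPlayer, HAND, PLAYER_NUM, ROUND_NUM). A minimal sketch of the imports it
# would need; exact module paths may differ depending on the Keras version in use:

from gym import spaces                         # Discrete / Box space helpers
from keras.models import Sequential            # or tensorflow.keras.models
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam              # or tensorflow.keras.optimizers
from rl.agents.dqn import DQNAgent             # keras-rl
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
# AbstractPlayer, HAND, PLAYER_NUM and ROUND_NUM are assumed to come from the game's own modules.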
agent.load_weights('{p}/dqn_{fn}_weights.h5f'.format(p=PATH, fn=ENV_NAME))

## Train or evaluate
if TRAIN:
    agent.training = True

observation = market.reset()
while True:
    try:
        # TODO: add callbacks?
        ## The agent selects an action
        # (candles=9(mb=>(2,4)?), tickers=4, trades=2)
        # TODO: actions for a multi-symbol market
        action = agent.forward(observation)

        ## Execute the action and get the environment's response
        observation, reward, done, info = market.step([action])
        agent.backward(reward, terminal=done)

        ## If the episode has ended, reset the environment and the agent state
        if done:
            observation = market.reset()
            agent.reset_states()
            done = False
            log.info('Is terminal state. Reset..')
            log.info('=' * 40)
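# The manual forward/backward loop above mirrors what keras-rl's fit() does internally.
# A minimal alternative sketch, assuming `market` is a gym-style env, `agent` is a keras-rl
# agent, and market.step() also accepts a bare action (the loop above wraps it in a list);
# nb_steps and nb_episodes here are illustrative values, not taken from the snippet:
if TRAIN:
    agent.fit(market, nb_steps=50000, visualize=False, verbose=2)
    agent.save_weights('{p}/dqn_{fn}_weights.h5f'.format(p=PATH, fn=ENV_NAME), overwrite=True)
else:
    agent.test(market, nb_episodes=5, visualize=False)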
# Verify directories
if bootROM is not None and not os.path.exists(bootROM):
    print("Boot-ROM not found. Please copy the Boot-ROM to '%s'. Using replacement in the meanwhile..." % bootROM)
    bootROM = None

try:
    filename = "../ROMs/Pokemon Red.gb"

    # Start PyBoy and run the loop
    pyboy = PyBoy(Window(scale=scale), filename, bootROM)
    step = 0
    while not pyboy.tick():
        try:
            # ((160, 144) * scale)-sized black/white array
            screen_array = pyboy.getScreenBuffer().getScreenBuffer()
            # print(screen_array.shape)
            observation = dqn.processor.process_observation(screen_array)
            action = dqn.forward(observation)
            pyboy.sendInput(actions[action])
        except Exception as e:
            print(e)
    pyboy.stop()
except KeyboardInterrupt:
    print("Interrupted by keyboard")
    pyboy.stop()
except Exception as ex:
    traceback.print_exc()
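# dqn.processor.process_observation(...) assumes the agent was built with a keras-rl
# Processor. A minimal sketch of such a processor, assuming the screen buffer arrives as a
# (160, 144) uint8 array that should be downscaled before being fed to the network;
# GameBoyProcessor and INPUT_SHAPE are illustrative names, not taken from the snippet:
import numpy as np
from PIL import Image
from rl.core import Processor

INPUT_SHAPE = (80, 72)  # hypothetical downscaled (width, height)

class GameBoyProcessor(Processor):
    def process_observation(self, observation):
        # Downscale the raw screen buffer and keep it as uint8 to save replay memory.
        img = Image.fromarray(np.asarray(observation, dtype=np.uint8))
        img = img.resize(INPUT_SHAPE)
        return np.array(img, dtype=np.uint8)

    def process_state_batch(self, batch):
        # Convert stored uint8 frames to floats in [0, 1] only at training time.
        return batch.astype('float32') / 255.0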
dqn.compile(Adam(lr=0.001), metrics=['mae'])

if args.train:
    # dqn.load_weights('dqn_traffic_weights.h5f')
    history = dqn.fit(env, nb_steps=100_000, visualize=False, verbose=2)
    env.close()
    dqn.save_weights('dqn_traffic_weights.h5f', overwrite=True)
    pandas.DataFrame(history.history['episode_reward']).plot(figsize=(16, 5))
    plt.savefig('output.png')

if args.test:
    dqn.load_weights(WEIGHTS_PATH)
    done = False
    step = 0
    obs = env.reset()
    while not done:
        action = dqn.forward(obs)
        obs, reward, done, _ = env.step(action)
        # print(env.get_trafficlight_phase('cp'))
        # env.get_road_info(road_id)
        print(f'Action: {action}')
        print(f'Observation: {obs}')
        print(f'Reward: {reward}')
        print(f'Done: {done}')
        print(f'Max Occupancy: {env.get_max_occupancy()}')
        step += 1
    env.close()
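# The args.train / args.test flags above are assumed to come from a small argparse setup
# along these lines; the flag names are taken from the snippet, everything else is a guess:
import argparse

parser = argparse.ArgumentParser(description='DQN traffic-light control')
parser.add_argument('--train', action='store_true', help='train the agent and save weights')
parser.add_argument('--test', action='store_true', help='load weights and run a greedy rollout')
args = parser.parse_args()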
class CenteringDqn:
    def __init__(self, **args):
        height, width = getsize()
        self.params = params
        self.params['width'] = width
        self.params['height'] = height
        self.params['num_training'] = args['numTraining']
        self.params['load_file'] = params['load_file']
        self._build_dqn_agent(self.params)

        if args['numTraining'] > 0:
            self._dqn.training = True
        else:
            self._dqn.training = False

        self.img = None
        self.zone = 0
        self.reward = 0
        self._dqn_action = None
        self.terminal = None
        self.accum_reward = 0

        self._train()

    def _build_dqn_agent(self, params):
        NB_ACTIONS = 7

        # --------------------------------------------------------------------
        inputShape = (params['width'], params['height'], 3)
        model = Sequential()
        model.add(Conv2D(16, (3, 3), input_shape=inputShape, padding='same', activation='relu'))
        model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(NoisyNetDense(16, activation='linear'))
        model.add(Flatten())
        model.add(NoisyNetDense(NB_ACTIONS, activation='linear'))
        model.summary()
        # --------------------------------------------------------------------

        # Replay memory
        if not params['prio_memory']:
            print("Using Sequential memory")
            memory = SequentialMemory(limit=params['mem_size'], window_length=1)
        else:
            print("Using Prioritized memory")
            params['lr'] = params['lr'] / 4
            memory = PrioritizedMemory(limit=params['mem_size'],
                                       alpha=0.6,
                                       start_beta=0.5,
                                       end_beta=1.0,
                                       steps_annealed=params['annealing'],
                                       window_length=1)

        # Epsilon-greedy policy with linearly decreasing epsilon
        if not params['noisy_layer']:
            print("Using Annealed Eps Greedy policy")
            self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                               attr='eps',
                                               value_max=params['eps'],
                                               value_min=params['eps_final'],
                                               value_test=0.0,
                                               nb_steps=params['annealing'])
        # Or a plain greedy policy when noisy layers provide the exploration
        else:
            print("Using Q Greedy policy (with noisy layer)")
            self.policy = GreedyQPolicy()

        # keras-rl DQN agent
        self._dqn = DQNAgent(model=model,
                             nb_actions=NB_ACTIONS,
                             policy=self.policy,
                             memory=memory,
                             batch_size=params['batch_size'],
                             processor=WindowProcessor(),
                             enable_double_dqn=True,
                             enable_dueling_network=True,
                             nb_steps_warmup=params['train_start'],
                             gamma=params['discount'],
                             target_model_update=1000,
                             train_interval=1,
                             delta_clip=1.,
                             custom_model_objects={"NoisyNetDense": NoisyNetDense})
        self._dqn.compile(Adam(lr=params['lr']), metrics=['mae'])

        if params['load_file']:
            print("file loaded")
            self._dqn.load_weights(params['load_file'])

    def _load_img(self):
        self.img, self.zone = getimageandzone()

    def _get_reward(self):
        tab = [[20, 10, 5, 0, -5, -10, -20],
               [5, 20, 10, 0, -5, -10, -20],
               [-5, 5, 20, 0, -5, -10, -20],
               [-20, -10, 5, 20, 5, -10, -20],
               [-20, -10, -5, 0, 20, 5, -5],
               [-20, -10, -5, 0, 10, 20, 5],
               [-20, -10, -5, 0, 5, 10, 20]]
        if self._dqn_action is not None:
            self.reward = tab[self.zone][self._dqn_action]

    def _train(self):
        self.terminal = False
        for i in range(0, self.params['num_training']):
            # Generate an image and the corresponding zone
            self._load_img()
            if self._dqn_action is not None:
                # Backward pass
                self._dqn.backward(self.reward, self.terminal)
            # Forward pass
            self._dqn_action = self._dqn.forward(self.img)
            self._get_reward()
            self._dqn.step += 1
            self.accum_reward = self.accum_reward + self.reward
            print("Step : " + str(i) + " \treward : " + str(self.reward) +
                  " \taccumrwd : " + str(self.accum_reward) + " \tzone : " +
                  str(self.zone) + " \taction : " + str(self._dqn_action))
            log_file = open("log.txt", 'a')
            log_file.write("Step : " + str(i) + " \treward : " + str(self.reward) +
                           " \taccumrwd : " + str(self.accum_reward) + " \tzone : " +
                           str(self.zone) + " \taction : " + str(self._dqn_action) + "\n")
            log_file.close()

            # On the next-to-last iteration, mark the transition as terminal and save the weights
            if i == self.params['num_training'] - 2:
                self.terminal = True
                self._dqn.save_weights(params['save_file'])

            # Periodic checkpoint
            if i % 1000 == 0:
                self._dqn.save_weights(params['save_file'] + str(i), True)
                self.accum_reward = 0
                print("Model saved")
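# CenteringDqn reads its hyperparameters from a module-level `params` dict. The keys below
# are the ones the class actually accesses; the values are illustrative guesses only:
params = {
    'load_file': None,        # path to existing weights, or None
    'save_file': 'centering_dqn.h5f',
    'prio_memory': False,     # SequentialMemory vs PrioritizedMemory
    'noisy_layer': True,      # use GreedyQPolicy when noisy layers provide exploration
    'mem_size': 100000,       # replay buffer size
    'batch_size': 32,
    'lr': 0.00025,
    'eps': 1.0,               # only used when noisy_layer is False
    'eps_final': 0.1,
    'annealing': 100000,      # steps for epsilon / beta annealing
    'train_start': 1000,      # nb_steps_warmup
    'discount': 0.99,         # gamma
}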
def startDummy(env, Comm, tryHard=False):
    nb_actions = env.action_space.n

    layer0Size = 4096
    layer1Size = 4096
    layer2Size = 4096
    layer3Size = 0
    layer4Size = 0
    layer5Size = 1

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(layer0Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer1Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer2Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # A quick diagnostic of the model
    print(model.summary())

    # Finally, we configure and compile our agent.
    memory = SequentialMemory(limit=800000, window_length=1)
    policy = GreedyQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy,
                   enable_dueling_network=True)
    dqn.compile(Nadam(lr=0.001), metrics=['mae'])

    # Load weights from a previous training run
    previousfileLength = 0
    load_file_number = 39
    loadFile = ("Larger_Memeory_BOARDSIZE_" + str(max_board_size) +
                "_DQN_LAYERS_" + str(layer0Size) + "_" + str(layer1Size) + "_" + str(layer2Size) +
                "_" + str(layer3Size) + "_" + str(layer4Size) + "_" + str(layer5Size) +
                "_SAVENUMBER_" + str(load_file_number) + ".h5f")
    dqn.load_weights(loadFile)

    while True:
        data = None
        while data is None:
            data = Comm.getNewData()

        observation, notUsed, currSafeMoves, headButtSafeMoves, noStuckMoves, foodMoves = \
            env.findObservation(data=data)
        action = dqn.forward(observation)

        if action == 0:
            moveChosen = 'left'
        if action == 1:
            moveChosen = 'right'
        if action == 2:
            moveChosen = 'up'
        if action == 3:
            moveChosen = 'down'

        # Override the network's choice if it is unsafe and a safer alternative exists
        if moveChosen not in currSafeMoves and len(currSafeMoves) > 0:
            moveChosen = choice(currSafeMoves)
        if moveChosen not in noStuckMoves and len(noStuckMoves) > 0:
            moveChosen = choice(noStuckMoves)
        if moveChosen not in headButtSafeMoves and len(headButtSafeMoves) > 0:
            moveChosen = choice(headButtSafeMoves)
        if moveChosen not in foodMoves and len(foodMoves) > 0:
            moveChosen = choice(foodMoves)

        Comm.giveNewMove(moveChosen)
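# startDummy only needs two methods from Comm: getNewData(), returning the latest game state
# (or None if nothing new has arrived), and giveNewMove(move), sending the chosen move back.
# A minimal in-process stand-in for wiring up tests; QueueComm is entirely hypothetical:
import queue

class QueueComm:
    def __init__(self):
        self.incoming = queue.Queue()   # game states pushed by the server loop
        self.outgoing = queue.Queue()   # moves consumed by the server loop

    def getNewData(self):
        try:
            return self.incoming.get_nowait()
        except queue.Empty:
            return None

    def giveNewMove(self, move):
        self.outgoing.put(move)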
# Okay, now it's time to learn something! Visualizing the training slows it down quite a lot,
# so it is disabled here. You can always safely abort the training prematurely using Ctrl + C.
env.is_train = True
dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))
dqn.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Dump the learned policy (action per observation) to JSON.
with open('dqn_action.json', 'w') as fw:
    observation = status.tolist()
    action = [
        float(actions[dqn.forward(np.array([obs]))]) for obs in observation
    ]
    json.dump({'observation': observation, 'action': action}, fw)

# Dump the Q-values over the whole state batch to CSV.
state_batch = status.reshape([-1, 1, 1])
q_val = pd.DataFrame(dqn.compute_batch_q_values(state_batch))
q_val.to_csv('dqn_qvalue.csv')

env.is_train = False
env.plot_row = 1
env.plot_col = 5

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)
env.plt.ioff()
env.plt.show()
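# The exported policy and Q-values can be inspected offline. A short sketch, assuming pandas
# and matplotlib are available; the file names match the ones written above:
import json
import pandas as pd
import matplotlib.pyplot as plt

with open('dqn_action.json') as fr:
    policy = json.load(fr)
plt.plot(policy['observation'], policy['action'])
plt.xlabel('observation')
plt.ylabel('chosen action')

q_val = pd.read_csv('dqn_qvalue.csv', index_col=0)
q_val.plot(title='Q-values per action')
plt.show()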