def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            observation = env.getScreenRGB()
            score = env.score()
            # action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)
            # act on the prediction above: only the optimal action is chosen
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    return np.mean(eval_reward)
p.init()
# get the current state values (state array)
game_current_state = agent.get_current_state(game.getGameState())
# initialize the episode counter to 0
number_of_episodes = 0
# initialize the maximum-score variable to 0
maximum_score = 0
# loop forever, iterating through the episodes
while True:
    # get the optimal action for the current state
    maximum_action = agent.get_action(game_current_state)
    # get the score in the current episode
    current_score = p.score()
    # track the maximum score by comparing it with the current score
    maximum_score = max(current_score, maximum_score)
    # get the reward for performing the action above (reward is either 1 or -1000)
    reward = agent.perform_action(p, maximum_action)
    # get the next state values (state array)
    game_next_state = agent.get_current_state(game.getGameState())
    # update the Q values
    agent.update_Q_values(game_current_state, game_next_state, reward,
                          maximum_action)
    # the next state becomes the current state
    game_current_state = game_next_state
    time.sleep(0.01)
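# `agent.update_Q_values` above implements the tabular Q-learning update. A
# minimal standalone sketch of that update, assuming a dict-of-dicts Q-table
# and illustrative alpha/gamma values (neither is taken from the original):
from collections import defaultdict

Q = defaultdict(lambda: defaultdict(float))  # Q[state][action] -> value

def update_Q_values(state, next_state, reward, action, alpha=0.1, gamma=0.95):
    # Bellman update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    best_next = max(Q[next_state].values(), default=0.0)
    Q[state][action] += alpha * (reward + gamma * best_next - Q[state][action])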
def play(size_image):
    sess = tf.InteractiveSession()
    img_size = 80
    net = NetworkOld(img_size)
    # open up a game state to communicate with the emulator
    game = flappybird.prepare_game()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    reward = 0.0
    # get the first state by doing nothing, then preprocess the image to 80x80x4
    actions = p.getActionSet()
    p.act(actions[1])
    s_t = preprocessing.transform_image(p.getScreenRGB(), img_size)
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("../saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    # start playing
    t = 0
    while t < MAX_ITE:
        if p.game_over():
            p.reset_game()
            terminal = True
        else:
            terminal = False
        # choose an action greedily from the network's Q-value estimates
        readout_t = net.readout.eval(feed_dict={net.s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        a_t[action_index] = 1
        # run the selected action and observe the next state and reward
        # (the network's action indices are swapped relative to PLE's action set)
        action = int(np.argmax(a_t))
        action = 1 if action == 0 else 0
        r_t = p.act(actions[action])
        s_t1 = preprocessing.transform_image_stacked(p.getScreenRGB(), s_t,
                                                     img_size)
        # update the old values
        s_t = s_t1
        t += 1
        print("TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t), " / SCORE", p.score())
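# `preprocessing.transform_image` and `transform_image_stacked` are used above
# but defined elsewhere. A hedged sketch of what they plausibly do, assuming
# the usual DQN pipeline (grayscale, resize, binarize, keep a 4-frame stack):
import cv2
import numpy as np

def transform_image(screen_rgb, img_size):
    gray = cv2.cvtColor(screen_rgb, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (img_size, img_size))
    _, binary = cv2.threshold(small, 1, 255, cv2.THRESH_BINARY)
    # initial state: the same frame repeated 4 times -> img_size x img_size x 4
    return np.stack([binary] * 4, axis=2)

def transform_image_stacked(screen_rgb, s_t, img_size):
    frame = transform_image(screen_rgb, img_size)[:, :, :1]
    # append the newest frame, drop the oldest
    return np.append(frame, s_t[:, :, :3], axis=2)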
epochs = 10000000
game_duration = 1000
rewards = []
avg_rewards = []
epsilons = []
steps = []
step = 0
plt.ion()
for epoch in range(epochs):
    p.reset_game()
    for it in range(game_duration):
        if p.game_over():
            p.reset_game()
            print("Score: " + str(p.score()))
        current_state = game.getGameState()
        processed_current_state = process_state(current_state)
        action = agent.act(processed_current_state)
        reward = p.act(actions[action])
        rewards.append(reward)
        next_state = game.getGameState()
        game_over = p.game_over()
        processed_next_state = process_state(next_state)
        agent.remember(processed_current_state, action, reward,
                       processed_next_state, game_over)
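# `process_state` is assumed above but not shown. A minimal sketch that
# flattens PLE's game-state dict into a fixed-length numpy vector (the key
# ordering and dtype are assumptions):
import numpy as np

def process_state(state):
    values = []
    for key in sorted(state):  # sorted keys give a stable feature order
        v = state[key]
        if isinstance(v, (list, tuple, np.ndarray)):
            values.extend(np.ravel(v))
        else:
            values.append(v)
    return np.array(values, dtype=np.float32).reshape(1, -1)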
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld

# let's adjust the rewards our agent receives
rewards = {
    "tick": -0.01,    # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0, # each time the agent bumps into a red circle
}

# make a PLE instance.
# use a lower fps so we can see what's happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()
    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)
    print("Score: {:0.3f} | Reward: {:0.3f}".format(p.score(), reward))
def q_learning(file_name=None,
               plot=False,
               gap_division=3,
               gamma=0.75,
               epsilon=0.9,
               batch_size=128,
               reward_weight_decision=True,
               buffer_size=5000):
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"
    game = FlappyBird(width=game_width, height=game_height,
                      pipe_gap=game_pipe_gap)
    p = PLE(game, frame_skip=6)
    p.init()

    last_state = None
    last_action = 0
    last_actions_q_values = [0, 0]
    last_score = 0
    buffer = []
    episode = 0

    network = Network(batch_size, gamma, epsilon, gap_division)
    if file_name is not None:
        network.load(file_name, rename=True)
    else:
        leaky_option_hidden_layers, leaky_option_last_layer = False, False
        activation_hidden_layers = input(
            "Enter the activation function for the hidden layers (leave empty for default activation (relu)) \n"
        )
        activation_hidden_layers = ("relu" if activation_hidden_layers == ""
                                    else activation_hidden_layers)
        if activation_hidden_layers == "leaky relu":
            alpha_relu = input(
                "Enter alpha value for relu activation (0.3 by default)\n")
            if alpha_relu == "0.3" or alpha_relu == "":
                activation_hidden_layers = LeakyReLU(alpha=0.3)
            else:
                activation_hidden_layers = LeakyReLU(alpha=float(alpha_relu))
            leaky_option_hidden_layers = True
        activation_last_layer = input(
            "Enter the activation function for the last layer (leave empty for default activation (linear)) \n"
        )
        activation_last_layer = ("linear" if activation_last_layer == ""
                                 else activation_last_layer)
        if activation_last_layer == "leaky relu":
            alpha_relu = input(
                "Enter alpha value for relu activation (0.3 by default)\n")
            if alpha_relu == "0.3" or alpha_relu == "":
                activation_last_layer = LeakyReLU(alpha=0.3)
            else:
                activation_last_layer = LeakyReLU(alpha=float(alpha_relu))
            leaky_option_last_layer = True
        weight_initializer = input(
            "Enter weight initializer (leave empty for default value (glorot_uniform)) \n"
        )
        weight_initializer = ("glorot_uniform" if weight_initializer == ""
                              else weight_initializer)
        bias_initializer = input(
            "Enter bias initializer (leave empty for default value (glorot_uniform)) \n"
        )
        bias_initializer = ("glorot_uniform" if bias_initializer == ""
                            else bias_initializer)
        loss_func = input(
            "Enter loss function (leave empty for default value (binary_crossentropy)) \n"
        )
        loss_func = "binary_crossentropy" if loss_func == "" else loss_func
        optimizer = input(
            "Enter the optimizer for the neural network (leave empty for default value (Adadelta)) or (Adadelta/RMSprop/SGD/Nadam) \n"
        )
        optimizer = "Adadelta" if optimizer == "" else optimizer
        optimizer_parameters = set_optimizer_parameters(optimizer)
        network.create_layers(
            activation_hidden_layers=activation_hidden_layers,
            activation_last_layer=activation_last_layer,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            loss_function=loss_func,
            optimizer=optimizer,
            optimizer_parameters=optimizer_parameters,
            leaky_hidden_layers=leaky_option_hidden_layers,
            leaky_last_layer=leaky_option_last_layer)

    while 1:
        if p.game_over():
            # restart the game
            p.reset_game()
            # count episodes
            episode += 1
            if episode % 1000 == 0:
                network.save_file()
            # update the plot
            print(f'\n episode={episode}, epsilon={epsilon}, '
                  f'buffer_size={len(buffer)}, score={last_score}')
            if plot is True:
                plt.scatter(episode, last_score)
                plt.pause(0.001)
                print(f'\n episode={episode}, score={last_score}')
            # add the final (terminal) transition with a strongly negative label
            label = last_actions_q_values
            label[last_action] = -1000
            if len(buffer) < buffer_size:
                buffer += [(last_state, label)]
            else:
                buffer = buffer[1:] + [(last_state, label)]
            # reset everything
            last_state = None
            last_action = 0
            last_actions_q_values = [0, 0]
            last_score = 0

        # look at the current state
        current_state = p.getGameState()
        current_score = p.score()
        # compute the actions' Q values
        actions_q_values = network.Q(current_state).tolist()
        # compute the label for last_state
        reward = get_reward(state=current_state,
                            gap_division=gap_division,
                            reward_weight_decision=reward_weight_decision)
        max_q = max(actions_q_values)
        label = last_actions_q_values
        if current_score - last_score > 0:
            label[last_action] = (current_score - last_score) * 1000
        else:
            label[last_action] = reward + gamma * max_q
        # do not take the very first state into consideration
        if last_state is not None:
            # update the buffer
            if len(buffer) < buffer_size:
                buffer += [(last_state, label)]
            else:
                buffer = buffer[1:] + [(last_state, label)]
        # train
        if len(buffer) >= batch_size:
            sample = random.sample(buffer, batch_size)
            network.train(sample)
        # choose the optimal action with probability 1 - epsilon
        actions_indexes = np.arange(len(actions_q_values))
        optimal_action_to_take = np.argmax(actions_q_values)
        random_action = np.random.choice(actions_indexes)
        if np.random.uniform() < epsilon:
            action = random_action
        else:
            action = optimal_action_to_take
        # act accordingly (119 is the pygame keycode for 'w', i.e. flap)
        p.act(None if action == 0 else 119)
        # decay epsilon
        if epsilon > 0.1:
            epsilon = epsilon - 0.00000075
        # remember everything needed from the current state
        last_action = action
        last_state = current_state
        last_actions_q_values = actions_q_values
        last_score = current_score
        # log
        sys.stdout.write(
            f'\rBottom: {game_height - current_state["next_pipe_bottom_y"]}, '
            f'Top: {game_height - current_state["next_pipe_top_y"]}, '
            f'Bird: {game_height - current_state["player_y"]}, '
            f'Reward: {reward}')
        sys.stdout.flush()
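# `get_reward` above is the hand-shaped reward and is not shown. A hedged
# sketch of one plausible shaping, rewarding the bird for staying near the
# centre of the pipe gap (the band width and constants are assumptions):
def get_reward(state, gap_division=3, reward_weight_decision=True):
    gap_top = state["next_pipe_top_y"]
    gap_bottom = state["next_pipe_bottom_y"]
    gap_centre = (gap_top + gap_bottom) / 2
    distance = abs(state["player_y"] - gap_centre)
    band = (gap_bottom - gap_top) / gap_division
    # inside the central band of the gap: positive reward; otherwise negative,
    # optionally weighted by how far off-centre the bird is
    if distance <= band:
        return 1.0
    return -distance / gap_centre if reward_weight_decision else -1.0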
if __name__ == '__main__':
    reward = 0
    steps = 1000
    epoch = 0
    limit = 100
    la = LearningAgent(list(game.getActions()))
    la.brain.load()
    scores = []
    i = 0
    while epoch <= limit:  # we want to train
        i += 1
        state = list(p.getGameState().values())
        reward = p.score()
        # print(reward)
        action = la.brain.update(reward, state)
        la.pickAction(action)
        if i > steps:
            print(epoch)
            epoch += 1
            la.brain.save()
            scores.append(la.brain.score())
            plt.plot(scores)
            plt.savefig("RewardGraph.png")
            i = 0
    la.brain.save()
    plt.show()
def evaluate(agent1, agent2, agent3):
    input("Press Enter to start the match")
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    frame_number = 0
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        output_movie = cv2.VideoWriter(videoname + '_' + str(i) + '.mp4',
                                       fourcc, 20, (288, 512))
        env.init()
        env.reset_game()
        dstate = env.getGameState()
        # print(dstate)
        obs = list(dstate.values())
        last_obs = np.zeros_like(obs[0:8])
        episode_reward = 0
        while True:
            # each agent sees its own 8-value slice of the state
            obs1 = obs[0:8]
            obs2 = obs[8:16]
            obs3 = obs[16:24]
            action1 = agent1.predict(obs1)
            action2 = agent2.predict(obs2)
            action3 = agent3.predict(last_obs, obs3)
            # encode the three binary decisions as a bitmask
            finalaction = 0
            if action1 == 0:
                finalaction += 1
            if action2 == 0:
                finalaction += 2
            if action3 == 0:
                finalaction += 4
            # print("action1: ", action1)
            # print("action2: ", action2)
            # print("action3: ", action3)
            # print("action: ", finalaction)
            # print(obs)
            # print(obs1)
            # print(obs2)
            # print(obs3)
            if finalaction == 0:
                finalaction = None
            score = env.score()
            observation = env.getScreenRGB()
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            ss = observation.shape
            observation = cv2.resize(observation, (ss[1] * 2, ss[0] * 2))
            output_movie.write(observation)
            cv2.imshow("ss", observation)
            cv2.waitKey(30)
            # act on the combined prediction (greedy, no exploration)
            reward = env.act(finalaction)
            last_obs = obs3
            dstate = env.getGameState()
            # print(dstate)
            obs = list(dstate.values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        # input()
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    output_movie.release()
    input()
    return np.mean(eval_reward)
class Bot():
    """
    This is our test agent. It's gonna pick some actions after training!
    """

    def __init__(self, lr):
        self.lr = lr
        self.game = Pixelcopter(width=480, height=480)
        self.p = PLE(self.game, fps=60, display_screen=True)
        self.actions = self.p.getActionSet()

    # def pickAction(self, reward, obs):
    #     return random.choice(self.actions)

    def frame_step(self, act_inp):
        terminal = False
        reward = self.p.act(act_inp)
        if self.p.game_over():
            self.p.reset_game()
            terminal = True
            reward = -1
        else:
            reward = 1
        self.score = self.p.score()
        img = self.p.getScreenGrayscale()
        img = transform.resize(img, (80, 80))
        img = exposure.rescale_intensity(img, out_range=(0, 255))
        img = img / 255.0
        return img, reward, terminal

    def build_model(self):
        print("Building the model..")
        model = Sequential()
        model.add(Convolution2D(32, 8, 8, subsample=(4, 4),
                                border_mode='same',
                                input_shape=(img_rows, img_cols,
                                             img_channels)))  # 80*80*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(2))
        adam = Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=adam)
        self.model = model
        print("Finished building the model..")

    def trainNetwork(self, mode):
        D = deque()
        x_t, r_0, terminal = self.frame_step(self.actions[1])
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        # print(s_t.shape)
        # need to reshape for Keras: 1*80*80*4
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

        if mode == 'Run':
            OBSERVE = 999999999  # we keep observing, never train
            epsilon = FINAL_EPSILON
            print("Now we load the weights")
            self.model.load_weights("model.h5")
            adam = Adam(lr=self.lr)
            self.model.compile(loss='mse', optimizer=adam)
            print("Weights loaded successfully")
        else:  # we go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON

        t = 0
        while (True):
            loss = 0
            Q_sa = 0
            action_index = 0
            r_t = 0
            # choose an action epsilon-greedily
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(num_actions)
                    chosen_act = self.actions[action_index]
                else:
                    # input a stack of 4 images, get the prediction
                    q = self.model.predict(s_t)
                    max_Q = np.argmax(q)
                    action_index = max_Q
                    chosen_act = self.actions[action_index]
            # we reduce epsilon gradually
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
            # run the selected action and observe the next state and reward
            x_t1, r_t, terminal = self.frame_step(chosen_act)
            x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
            # store the transition in D
            D.append((s_t, action_index, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()
            # only train once done observing
            if t > OBSERVE:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)
                # now we do the experience replay
                state_t, action_t, reward_t, state_t1, terminal = zip(
                    *minibatch)
                state_t = np.concatenate(state_t)
                state_t1 = np.concatenate(state_t1)
                targets = self.model.predict(state_t)
                Q_sa = self.model.predict(state_t1)
                targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(
                    Q_sa, axis=1) * np.invert(terminal)
                loss += self.model.train_on_batch(state_t, targets)
            s_t = s_t1
            t = t + 1
            # save progress every 1000 iterations
            if t % 1000 == 0:
                print("Now we save the model")
                self.model.save_weights("model.h5", overwrite=True)
                with open("model.json", "w") as outfile:
                    json.dump(self.model.to_json(), outfile)
            # print info
            if t <= OBSERVE:
                state = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", t, "/ STATE", state,
                  "/ EPSILON", epsilon, "/ ACTION", action_index,
                  "/ REWARD", r_t, "/ Q_MAX", np.max(Q_sa), "/ Loss", loss)
        print("Episode finished!")
        print("************************")

    def playGame(self, mode):
        self.build_model()
        self.trainNetwork(mode)

    def main(self):
        modes = ["Train", "Run"]
        # input() returns a string in Python 3, so cast it to int
        mode = modes[int(input("Do you wanna Train(0) or Run(1): "))]
        self.playGame(mode)
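# The hyperparameter constants referenced above (OBSERVATION, EXPLORE, the
# epsilon bounds, etc.) are defined elsewhere in the original. Typical values
# for this kind of Keras DQN, offered here only as assumptions:
img_rows, img_cols, img_channels = 80, 80, 4
num_actions = 2
GAMMA = 0.99           # discount factor
OBSERVATION = 3200     # timesteps to observe before training
EXPLORE = 3000000      # frames over which to anneal epsilon
INITIAL_EPSILON = 0.1
FINAL_EPSILON = 0.0001
REPLAY_MEMORY = 50000  # transitions to keep in the deque
BATCH = 32             # minibatch size
FRAME_PER_ACTION = 1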
            return self.actions[1]
        elif fwd[1] < 0 and abs(fwd[1]) > abs(fwd[0]):
            return self.actions[2]
        elif fwd[0] < 0 and abs(fwd[0]) > abs(fwd[1]):
            return self.actions[3]
        else:
            return self.actions[4]


os.putenv('SDL_VIDEODRIVER', 'fbcon')
os.environ["SDL_VIDEODRIVER"] = "dummy"

# create our game
force_fps = True  # don't lock to real time; run as fast as possible
display_screen = False
game = WaterWorld()

# make a PLE instance.
p = PLE(game, force_fps=force_fps)

# init agent and game.
p.init()
p.display_screen = True
reward = 0
agent = MyAgent(p.getActionSet())

while not p.game_over():
    state = p.getGameState()
    action = agent.pickAction(reward, state)
    reward = p.act(action)
    print(p.score())
def train():
    game = Snake(600, 600)
    p = PLE(game,
            fps=60,
            state_preprocessor=process_state,
            force_fps=True,
            display_screen=False,
            frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -110.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.99,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]), str(sys.argv[5])])
    p.init()
    # agent.load_game()
    scores = []
    for _ in range(100000):
        if p.game_over():
            p.reset_game()
        score = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))
            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]
            reward = p.act(direction[0])
            new_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))
            agent.add_experience(old_state, action, reward, new_state)
            agent.learn()
            score = p.score()
        scores.append(score)
        print(f"Score for model iteration {str(sys.argv[3])} with "
              f"learning_rate {sys.argv[1]}, gamma {sys.argv[2]}, "
              f"activations: {sys.argv[4], sys.argv[5]} is {score}. "
              f"Epsilon is {agent.epsilon}")
    agent.save_game()
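# `prepare_corect_directions` (spelling kept from the call above) and `vision`
# come from elsewhere in the original. A hedged sketch of the direction
# helper, assuming pygame keycodes and that the snake may go straight, left
# or right but never reverse (which matches n_actions=3 above):
from pygame.constants import K_w, K_a, K_s, K_d

def prepare_corect_directions(current_direction):
    # maps keycode -> resulting direction name, excluding the reverse move
    all_directions = {K_w: "Up", K_s: "Down", K_a: "Left", K_d: "Right"}
    opposites = {"Up": "Down", "Down": "Up", "Left": "Right", "Right": "Left"}
    return {key: name for key, name in all_directions.items()
            if name != opposites[current_direction]}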
class FlappyBirdEnvironment(Environment):
    def __init__(self):
        env = FlappyBird()
        self.p = PLE(env, add_noop_action=True)
        self.p.init()
        self.win_score = 10.
        action_space = len(self.p.getActionSet())
        state_space = len(self.p.getGameState())
        actions = ["up", "nothing"]
        state_names = list(self.p.getGameState().keys())
        Environment.__init__(self, env, action_space, state_space, actions,
                             state_names)

    def reset_environment(self):
        self.p.reset_game()

    def get_state(self) -> np.array:
        state = list(self.p.getGameState().values())
        state = np.array(state)
        return state

    def get_normalized_state(self) -> np.array:
        """Get the current state of the environment with each state attribute
        normalized in [0, 1], ready to be fed to a NN.

        Returns:
            The current normalized state (np.array)
        """
        state = self.get_state()
        states_mins = np.array([0., -10., 0., 0., 103., 103., 0., 103.])
        states_maxs = np.array([410., 10., 288., 205., 308., 410., 205., 308.])
        state = (state - states_mins) / (states_maxs - states_mins)
        return state

    def environment_step(self, action: int) -> (np.array, int, bool):
        """Do a move in the environment.

        Args:
            action: The action to take

        Returns:
            The next state, the reward obtained by doing the action, and
            whether the environment is terminated
        """
        p_action = self.p.getActionSet()[action]
        reward = self.p.act(p_action)
        done = self.p.game_over()
        if self.p.score() >= self.win_score:
            done = True
        next_state = self.get_state()
        return next_state, reward, done

    def render_environment(self):
        self.p.display_screen = True
        self.p.force_fps = False

    def pass_test(self, rewards: List[float]):
        return np.mean(rewards) >= self.win_score

    def close(self):
        pygame.quit()

    def win_condition(self, episode: Episode):
        return episode.total_reward >= self.win_score
if __name__ == '__main__':
    # p.init()  # do I even need this? Kaio didn't seem to be using it for the naive agent
    print(game.getActions())
    thresh = False
    reward = 0
    steps = 1000
    la = LearningAgent(list(game.getActions()))
    # extract_image appears to come from utils; it resizes (and optionally
    # thresholds) the screen capture to 80x80
    snapshot = extract_image(p.getScreenRGB(), (80, 80), thresh=thresh)
    # stack four copies of the first frame to build the initial state
    stack_snaps = np.stack((snapshot, snapshot, snapshot, snapshot), axis=0)
    while not p.game_over():
        snapshot = extract_image(p.getScreenRGB(), (80, 80), thresh=thresh)
        snapshot = np.reshape(snapshot, (1, 80, 80))
        # st = the new 4-frame state stack
        st = np.append(stack_snaps[1:4, :, :], snapshot, axis=0)
        if train:
            reward, action, _, _, _ = train_and_play(p_action, st,
                                                     select_action,
                                                     perform_action,
                                                     possible_actions,
                                                     optimize, None, {})
            push_to_memory(stack_snaps, action, st, reward)
        else:
            play(p_action, st, select_action, perform_action,
                 possible_actions, None, {})
        stack_snaps = st
    score = p.score()
    p.reset_game()
    if train:
        save_model(save_path)
    # note: `return score` would not work here, since this is module-level
    # code rather than a function
# Game initialization
game = Snake(height=case_size * size, width=case_size * size)
p = PLE(game, fps=30, display_screen=True)
agent = Trainer(allowed_actions=p.getActionSet(),
                height=game.height,
                width=game.width)
p.init()
reward = 0.0
nb_frames = 10000000000000000
bestScore = 0

for i in range(nb_frames):
    if p.score() > bestScore:
        bestScore = int(p.score())
        print('New best score: ' + str(bestScore) + ' at ' +
              str(datetime.datetime.now()))
    if p.game_over():
        p.reset_game()
    observation = p.getGameState()
    food_location = [
        int(observation.get('food_x') / 10),
        int(observation.get('food_y') / 10)
    ]
    snake_location = [
        int(observation.get('snake_head_x') / 10),
        int(observation.get('snake_head_y') / 10)
    ]
class UMDAc():
    def __init__(self,
                 gen_size,
                 net_size,
                 activation,
                 env,
                 max_steps=None,
                 seed=0,
                 action_mode='argmax',
                 iterations=1,
                 display_info=False):

        ## Global variables
        self.gen_size = gen_size
        self.net_size = net_size
        self.activation = activation
        self.iterations = iterations
        self.seed = seed
        self.max_steps = max_steps

        ## Detect environment type, OpenAI Gym or PLE
        try:
            ## Environment is from OpenAI Gym
            self.state_size = env.observation_space.shape[0]
            self.openai = True
            self.ple = False
            self.env = env  ## Environment
            try:
                ## Size of the action vector the agent can take
                self.action_size = env.action_space.n
            except:
                ## Size of the action vector the agent can take
                self.action_size = env.action_space.shape[0]
        except:
            ## Environment is from PLE
            self.openai = False
            self.ple = True
            self.game = env
            ## Init environment
            self.env = PLE(self.game, fps=30, display_screen=True, rng=0)
            ## Allowed action set
            self.allowed_actions = list(self.env.getActionSet())
            self.action_size = len(self.allowed_actions)
            # self.state_size = len(self.game.getGameState())
            self.state_size = self._ple_get_state().shape[1]

        if display_info:
            ## Print environment info
            print('\n' + '#' * 5, ' Environment data: ', '#' * 5)
            print('Type (autodetected): ', 'Gym' if self.openai else 'PLE')
            print('State size: ', self.state_size)
            print('Action size: ', self.action_size)
            print('')
            print('Iterations: ', self.iterations)
            print('')
        '''
        ACTION MODE:
        Determines how the output of the neural network
        will be treated. Three options:
            - raw
            - argmax
            - tanh
        '''
        self.action_mode = action_mode

        self.fitness = {}  # Init fitness log

        ## Create the first generation randomly
        self.gen = {}  # Init generation 0
        ## Create random specimens
        for i in range(gen_size):
            ## Generate specimen weights and biases
            specimen = {}
            ## First layer
            specimen['h0'] = np.random.uniform(
                -1, 1, [self.state_size, net_size[0]])
            specimen['b0'] = np.random.uniform(-1, 1, [1, net_size[0]])
            ## Intermediate layers
            h_i = 1
            for layer in net_size[1:]:
                ## Generate hidden layers and biases
                specimen['h' + str(h_i)] = np.random.uniform(
                    -1, 1, [net_size[h_i - 1], net_size[h_i]])
                specimen['b' + str(h_i)] = np.random.uniform(
                    -1, 1, [1, net_size[h_i]])
                h_i += 1
            ## Last layer
            specimen['h' + str(h_i)] = np.random.uniform(
                -1, 1, [net_size[h_i - 1], self.action_size])
            specimen['b' + str(h_i)] = np.random.uniform(
                -1, 1, [1, self.action_size])
            ## Add specimen to generation
            self.gen['s' + str(i)] = specimen
            ## Add specimen to the fitness log, initialized
            ## with a fitness value of 0
            self.fitness['s' + str(i)] = 0.
        ## Create a dictionary to hold new specimens
        self.new = {}
        ## First new specimen (reference specimen)
        reference = {}
        reference['h0'] = np.empty([self.state_size, net_size[0]])
        reference['b0'] = np.empty([1, net_size[0]])
        ## Intermediate layers
        h_i = 1
        for layer in net_size[1:]:
            ## Generate hidden layers and biases
            reference['h' + str(h_i)] = np.empty(
                [net_size[h_i - 1], net_size[h_i]])
            reference['b' + str(h_i)] = np.empty([1, net_size[h_i]])
            h_i += 1
        ## Last layer
        reference['h' + str(h_i)] = np.empty(
            [net_size[h_i - 1], self.action_size])
        reference['b' + str(h_i)] = np.empty([1, self.action_size])
        ## Add reference to dict
        self.new['n0'] = reference

    def show(self, name, show_weights=False):
        ## For every layer in the specimen
        for l_i in range(int(len(self.gen[name]) / 2)):
            ## Print info about the layer and its bias
            print('-' * 5, " layer Nº", str(l_i), ' ', '-' * 5)
            print(' * Neurons: ', self.gen[name]['h' + str(l_i)].shape[1],
                  '\n', '* Weights of each neuron: ',
                  self.gen[name]['h' + str(l_i)].shape[0], '\n',
                  '* Biases: ', self.gen[name]['b' + str(l_i)].shape[1], '\n')
            if show_weights:
                ## Show weight values
                print("* Weights:")
                print(self.gen[name]['h' + str(l_i)])
                print("* Biases:")
                print(self.gen[name]['b' + str(l_i)])
                print('')

    def pass_forward(self, feature, specimen):
        in_data = feature  ## Load input data
        for l_i in range(int(len(specimen) / 2)):
            ## Multiply by the weights and add the bias
            h_z = np.dot(in_data,
                         specimen['h' + str(l_i)]) + specimen['b' + str(l_i)]
            ## Activation function
            h_a = self.activation(h_z)
            ## Pass data to the next layer
            in_data = h_a
        ## Return the last activation
        return h_a

    def gym_evaluate(self, specimen, render=False, time_sleep=.0):
        seed = self.seed  ## Initial random seed
        reward_log = []  ## For the total-reward sum when iterations > 1

        for iters in range(self.iterations):
            ## Reset environment
            self.env.seed(seed)
            state = self.env.reset()
            t_reward = 0  ## Reset total reward

            if self.max_steps != None:
                ## Finite number of time steps
                for step in range(self.max_steps):
                    ## Render env
                    if render:
                        self.env.render()
                    ## Pass state data forward
                    output = self.pass_forward(state, specimen)
                    ## Format the output to use it as the next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])
                    elif self.action_mode == 'raw':
                        action = output[0]
                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])
                    ## Run a new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time
                    ## Add the current reward to the total
                    t_reward += reward
                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                ## Update the seed to test the agent in different scenarios
                seed += 1
            else:
                ## Test the agent until game over
                done = False
                while not done:
                    ## Render env
                    if render:
                        self.env.render()
                    ## Pass state data forward
                    output = self.pass_forward(state, specimen)
                    ## Format the output to use it as the next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])
                    elif self.action_mode == 'raw':
                        action = output[0]
                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])
                    ## Run a new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time
                    ## Add the current reward to the total
                    t_reward += reward
                    ## End the game if game over
                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                seed += 1  ## Update random seed
        ## Disable the random seed
        '''
        This prevents the algorithm from generating
        the same random numbers every time.
        '''
        np.random.seed(None)
        ## Sum of total rewards over all iterations
        return sum(reward_log)

    def _ple_get_state(self):
        ## Adapt the game observation into a
        ## useful state vector
        observation = self.game.getGameState()
        state = []
        for item in observation:
            data = observation[item]
            if type(data) is dict:
                for d in data:
                    inf = np.array(data[d]).flatten()
                    for dt in inf:
                        state.append(dt)
            elif type(data) is list:
                data = np.array(data).flatten()
                for val in data:
                    state.append(val)
            else:
                state.append(data)
        return np.array([state])

    def ple_evaluate(self, specimen, time_sleep=.0):
        ## Set the initial random seed
        np.random.seed(self.seed)

        class MyRandom():
            def __init__(self, seed):
                pass
                # np.random.seed(seed)
                # np.random.seed(0)
                # self.seed = seed

            def random_sample(self, size=None):
                return np.random.random_sample(size)

            def choice(self, a, size=None, replace=True, p=None):
                return np.random.choice(a, size, replace, p)

            def random_integers(self, rmin, rmax):
                return np.random.randint(rmin, rmax)

            def uniform(self, low=0.0, high=1.0, size=None):
                return np.random.uniform(low, high, size)

            def rand(self):
                return np.random.rand()

        reward_log = []  ## Log of all total rewards

        if self.max_steps != None:
            ## Finite number of time steps
            for i in range(self.iterations):
                ## Initialize the game
                self.game.rng = MyRandom(self.seed)
                self.game.init()  ## Reset game
                t_reward = .0  ## Reset total reward
                for time_step in range(self.max_steps):
                    ## Get state
                    state = self._ple_get_state()
                    ## Output of the specimen for the given state
                    output = self.pass_forward(state, specimen)
                    ## Convert the specimen output to an action
                    act = self.allowed_actions[np.argmax(output[0])]
                    ## Take the action
                    self.env.act(act)
                    ## Wait time, useful if rendering is enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                    ## End the game if game over
                    if self.env.game_over():
                        break
                ## Log the reward for the later sum
                reward_log.append(t_reward)
        else:
            for i in range(self.iterations):
                ## Initialize the game
                self.game.rng = MyRandom(self.seed)
                self.game.init()
                t_reward = .0  ## Reset total reward
                while not self.env.game_over():
                    ## Get state
                    state = self._ple_get_state()
                    ## Take an action
                    output = self.pass_forward(state, specimen)
                    act = self.allowed_actions[np.argmax(output[0])]
                    self.env.act(act)
                    ## Useful if rendering is enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                ## Log all total rewards
                reward_log.append(t_reward)
        ## Disable the random seed
        '''
        This prevents the algorithm from generating
        the same random numbers every time.
        '''
        np.random.seed(None)
        ## Sum all total rewards
        return sum(reward_log)

    def train(self, n_surv, n_random_surv):
        ## Collect data about the generation
        survivors = list(self.fitness.keys())  ## Survivors' names
        survivors_fitness = list(
            self.fitness.values())  ## Survivors' fitnesses
        worsts = []  ## Worst specimens' names
        worsts_fitness = []  ## Worst specimens' fitness values

        ## Select the survivors with the best fitness
        n_r = len(survivors) - n_surv  ## Number of non-surviving specimens
        for n in range(n_r):
            ## Select the worst specimen
            indx = survivors_fitness.index(min(survivors_fitness))
            ## Save the worsts
            worsts.append(survivors[indx])
            worsts_fitness.append(survivors_fitness[indx])
            ## Delete the worsts from the survivor lists
            del survivors[indx]
            del survivors_fitness[indx]

        ## Randomly select bad specimens to survive
        for i in range(n_random_surv):
            ## Random index
            indx = np.random.randint(len(worsts))
            ## Add the random specimen to the survivors
            survivors.append(worsts[indx])
            survivors_fitness.append(worsts_fitness[indx])
            ## Update the worst specimens' lists
            del worsts[indx]
            del worsts_fitness[indx]

        ## Generate new specimens (empty):
        for i in range(len(worsts)):
            self.new['n' + str(i)] = copy.deepcopy(self.gen['s0'])

        for param in self.gen['s0']:
            ## For each parameter
            for i in range(self.gen['s0'][param].shape[0]):
                for j in range(self.gen['s0'][param].shape[1]):
                    ## layer[i][j] weight of each survivor
                    w = []
                    ## For each survivor
                    for name in survivors:
                        w.append(self.gen[name][param][i][j])

                    ## NOTE: Experimental
                    # n_mut = int(len(w)*.3)
                    # muts = np.random.rand(n_mut)
                    # w = np.array(w)
                    # np.random.shuffle(w)
                    #
                    # w = np.delete(w, range(len(w)-n_mut, len(w)), 0)
                    # w = np.hstack((w, muts))
                    # np.random.shuffle(w)
                    ## END OF NOTE

                    ## Compute the weight list's mean
                    mean = np.mean(w)
                    ## Standard deviation
                    std = np.std(w)
                    ## Draw samples
                    samples = np.random.normal(mean, std, len(worsts))

                    i_sample = 0  ## Iterator
                    ## Generate new specimens
                    for name in self.new:
                        ## Update weight
                        self.new[name][param][i][j] = samples[i_sample]
                        i_sample += 1

        ## After generating a set of new specimens, evaluate them
        new_names = []
        new_fitness = []
        for name in self.new:
            ## Load specimen
            specimen = self.new[name]
            ## Evaluate the new specimens
            ## and store the data for later comparison
            new_names.append(name)
            if self.openai:
                new_fitness.append(self.gym_evaluate(specimen))
            elif self.ple:
                new_fitness.append(self.ple_evaluate(specimen))
        '''
        Selection. Replace all specimens in the worsts list
        with the best specimens of the to_select lists.
        '''
        to_select_names = new_names + worsts
        to_select_fitness = new_fitness + worsts_fitness

        for i in range(len(worsts)):
            indx = np.argmax(to_select_fitness)
            ## Add the selected specimen to the new generation
            if 'n' in to_select_names[indx]:
                ## Replace the specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.new[to_select_names[indx]])
            else:
                ## Replace the specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.gen[to_select_names[indx]])
            ## Update the selection lists
            del to_select_names[indx]
            del to_select_fitness[indx]

    def add_neurons(self, layer_name, n_neurons=1):
        ## For every specimen in the generation
        for name in self.gen:
            ## Load specimen
            specimen = self.gen[name]
            last_indx = int(len(specimen) / 2) - 1  ## Number of layers
            sel_indx = int(layer_name[1])  ## Selected layer's index
            ## Add a neuron to the layer
            new_neuron = np.random.rand(specimen[layer_name].shape[0],
                                        n_neurons)
            specimen[layer_name] = np.hstack(
                (specimen[layer_name], new_neuron))
            ## Add a new bias
            new_bias = np.random.rand(1, n_neurons)
            specimen['b' + str(sel_indx)] = np.hstack(
                (specimen['b' + str(sel_indx)], new_bias))
            ## Check whether the selected layer is
            ## the last (output) layer of the net
            if sel_indx != last_indx:
                next_layer = specimen['h' + str(sel_indx + 1)]
                ## The selected layer isn't the last one.
                ## Generate new weights
                new_w = np.random.rand(n_neurons, next_layer.shape[1])
                ## Add the weights to the next layer
                specimen['h' + str(sel_indx + 1)] = np.vstack(
                    (new_w, next_layer))

    def add_layer(self, n_neurons):
        ## Add one layer to all specimens.
        ## The new layer is added before
        ## the output layer.
        ## Describe the network's layers
        specimen = self.gen['s0']
        layers = []
        layers_shape = []
        biases = []
        biases_shape = []
        for l in specimen:
            if 'h' in l:
                layers.append(l)
                layers_shape.append(specimen[l].shape)
            elif 'b' in l:
                biases.append(l)
                biases_shape.append(specimen[l].shape)

        for name in self.gen:
            ## Load specimen
            specimen = self.gen[name]
            ## Reset the output layer
            new_o = np.random.rand(n_neurons, self.action_size)
            ## Reset the output layer's bias
            new_o_b = np.random.rand(1, self.action_size)
            ## Create the new layer
            new_l = np.random.rand(layers_shape[-2][1], n_neurons)
            new_l_b = np.random.rand(1, n_neurons)
            specimen[layers[-1]] = new_l
            specimen[biases[-1]] = new_l_b
            specimen['h' + str(len(layers))] = new_o
            specimen['b' + str(len(biases))] = new_o_b

    def save_specimen(self, specimen, filename='specimen0.txt'):
        ## Open the file
        f = open(filename, 'w')
        ## Write the layers
        for layer in specimen:
            f.write(layer + '\n')
            f.write(str(specimen[layer].tolist()) + '\n')
        f.close()  # Close the file

    def load_specimen(self, filename):
        import ast
        ## Open the file
        f = open(filename, 'r')
        ## Init specimen
        specimen = {}
        ## Read the file
        array = False
        for line in f.readlines():
            line = line.split('\n')[0]
            if array:
                ## Convert the string to an np array
                layer = np.array(ast.literal_eval(line))
                specimen[layer_name] = layer
                ## Add layer
                array = False
            else:
                layer_name = line
                array = True
        f.close()  ## Close
        return specimen
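# A hedged usage sketch for the UMDAc class above, assuming a PLE game such
# as FlappyBird and a ReLU-style activation (the generation size, network
# shape and survivor counts below are illustrative, not from the original):
import numpy as np
from ple.games.flappybird import FlappyBird

def relu(x):
    return np.maximum(0, x)

umda = UMDAc(gen_size=30, net_size=[16], activation=relu,
             env=FlappyBird(), max_steps=500, iterations=1,
             display_info=True)
for generation in range(50):
    # evaluate every specimen, then breed a new generation
    for name in umda.gen:
        umda.fitness[name] = umda.ple_evaluate(umda.gen[name])
    umda.train(n_surv=10, n_random_surv=2)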
# Load the model
save_path = '.\model_dir\model_6700_2823.0.ckpt'  # episode_reward: 1785.0
agent.restore(save_path)

obs = list(env.getGameState().values())
# # preprocess obs
# obs = preprocess(obs)
episode_reward = 0
while True:
    # predict the action, choosing only the optimal one
    action = agent.predict(obs)
    # sleep so the rendering isn't too fast
    # time.sleep(0.02)  # delay, in seconds
    # show the score in a new window
    observation = env.getScreenRGB()
    score = env.score()
    # format conversion (PLE returns RGB, OpenCV expects BGR)
    observation = cv2.cvtColor(observation, cv2.COLOR_RGB2BGR)
    # rotate 90 degrees
    observation = cv2.transpose(observation)
    font = cv2.FONT_HERSHEY_SIMPLEX
    observation = cv2.putText(observation, "score:" + str(int(score)),
                              (0, 30), font, 0.6, (0, 0, 255), 2)
    cv2.imshow("flappybird", observation)
    cv2.waitKey(5)
    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # # preprocess obs
    # obs = preprocess(obs)
    done = env.game_over()
class Environment():
    def __init__(self, device, display=True):
        # Design the reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        self.env = PLE(FlappyBird(),
                       display_screen=display,
                       reward_values=reward_values)
        self.device = device
        self.action_set = self.env.getActionSet()
        self.frames = []

    def reset(self):
        self.env.reset_game()

    def start(self):
        self.env.act(0)
        obs = convert(self.env.getScreenGrayscale())
        # initial state: the first observation stacked 4 times -> (1, 4, H, W)
        self.state = np.stack([[obs for _ in range(4)]], axis=0)
        self.t_alive = 0
        self.total_reward = 0
        return self.state

    def game_over(self):
        return self.env.game_over()

    def getScore(self):
        return self.env.score()

    def step(self, action):
        reward = self.env.act(self.action_set[action])
        # make the next state
        obs = convert(self.env.getScreenGrayscale())
        obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
        next_state = np.append(self.state[:, 1:, ...], obs, axis=1)
        self.t_alive += 1
        self.total_reward += reward
        self.state = next_state
        return self.state, reward, self.env.game_over()

    def get_screen(self):
        return self.env.getScreenRGB()

    def record(self):
        self.frames.append(self.env.getScreenRGB())

    def saveVideo(self, episode, video_path):
        os.makedirs(video_path, exist_ok=True)
        clip = make_video(self.frames, fps=60).rotate(-90)
        clip.write_videofile(os.path.join(video_path,
                                          'env_{}.mp4'.format(episode)),
                             fps=60)
        print('Episode: {} t: {} Reward: {:.3f}'.format(
            episode, self.t_alive, self.total_reward))
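# `convert` and `make_video` above come from elsewhere in the original
# project. A hedged sketch of `convert`, assuming it downsamples the
# grayscale screen and scales pixel values to [0, 1] (the 84x84 target
# size is an assumption):
import cv2
import numpy as np

def convert(screen_gray, out_size=(84, 84)):
    frame = cv2.resize(screen_gray, out_size, interpolation=cv2.INTER_AREA)
    return frame.astype(np.float32) / 255.0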
# print("Action = ", action)
reward = p.act(p.getActionSet()[action])
# print("Reward = ", reward)
if p.game_over():
    episode_over = True
    # print(">>>DEAD!")
observation = game.getGameState()
observation = ((int(observation["player_y"]) -
                int(observation["next_pipe_bottom_y"])),
               int(observation["next_pipe_dist_to_player"]),
               int(observation["player_vel"]))
# print("Next observation = ", observation)
agent.update(action, reward, observation, episode_over)
if episode_over:
    batch_sum += frame_count
    episode_count += 1
    if episode_count % 100 == 0:
        output.write("Episode " + str(episode_count) + ", Score = " +
                     str(p.score()) + ", Avg Frames survived = " +
                     str(batch_sum / 100) + ", Q Size = " +
                     str(len(agent.q)) + "\n")
        print("Episode", episode_count, ", Score =", p.score(),
              ", Avg Frames survived =", batch_sum / 100,
              ", Q Size =", len(agent.q))
        batch_sum = 0
    # p.score is a method, so it must be called when comparing
    if p.score() > max_score:
        max_score = p.score()
        # q_table = copy.deepcopy(agent.q)
        q_table = dict(agent.q)
        # pickle requires a binary-mode file handle
        pickle.dump(q_table, open("agent_q.p", "wb"))
    p.reset_game()
    observation = game.getGameState()
    observation = ((int(observation["player_y"]) -
                    int(observation["next_pipe_bottom_y"])),
                   int(observation["next_pipe_dist_to_player"]),
                   int(observation["player_vel"]))
    agent.state = observation
    frame_count = 0
# print("observation = ", observation)
# print("reward = ", reward)
# number of training episodes
episodes = 20000
# instantiate the game object
game = FlappyBird()
# PLE wraps the game and provides the interface we interact with
p = PLE(game, fps=30, display_screen=True)
# initialize
p.init()
# instantiate the Agent, passing in the action set
agent = Agent(p.getActionSet())

for episode in range(episodes):
    # reset the game
    p.reset_game()
    # get the state
    state = agent.get_state(game.getGameState())
    while True:
        # get the best action
        action = agent.get_best_action(state)
        # perform the action and receive the reward
        reward = agent.act(p, action)
        # get the state after the action was performed
        next_state = agent.get_state(game.getGameState())
        state = next_state
        if p.game_over():
            print("Current score: {}".format(p.score()))
            break
        # slow the bird down a little
        time.sleep(0.02)
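# The Agent's `get_state` and `act` used above are not shown. A hedged sketch
# of both, assuming a coarsely discretized tabular state (the grid size is an
# assumption) and PLE's FlappyBird action set:
class Agent:
    def __init__(self, action_set):
        self.action_set = action_set

    def get_state(self, game_state):
        # coarse discretization keeps the Q-table small
        return (int(game_state["player_y"] -
                    game_state["next_pipe_bottom_y"]) // 10,
                int(game_state["next_pipe_dist_to_player"]) // 10,
                int(game_state["player_vel"]))

    def act(self, p, action):
        # map the agent's action index onto PLE's action set and step the game
        return p.act(self.action_set[action])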
    processed_state.append(creep[1])
    return np.array((processed_state, ))


p.init()
actions = p.getActionSet()[:-1]
agent = Agent(len(actions))

epochs = 10000000
game_duration = 1000

for epoch in range(epochs):
    p.reset_game()
    for it in range(game_duration):
        if p.game_over():
            p.reset_game()
            print("Finished with score: " + str(p.score()))
        current_state = game.getGameState()
        processed_current_state = process_state(current_state)
        action = agent.act(processed_current_state)
        # action = actions[np.random.randint(0, len(actions))]
        reward = p.act(actions[action])
        next_state = game.getGameState()
        game_over = p.game_over()
        print("Current score: " + str(p.score()))
    print("Finished with score: " + str(p.score()))
p.state_preprocessor = agent.process_state
# agent.load("model.h5")
# agent.epsilon = 0.05

fail, catch, j = 0, 0, 0
best_score = -np.inf
nb_games = 1

while 1:
    j += 1
    # reset from time to time
    if p.game_over() or j == 50000:
        fail, catch, j = 0, 0, 0
        best_score = max(best_score, p.score())
        nb_games += 1
        p.reset_game()
    observation = p.getGameState()
    action = agent.pickAction(observation)
    reward = p.act(action_set[action])
    if reward < -0.5:
        fail += 1
    if reward > 0.5:
        catch += 1
    agent.remember(observation, action, reward, p.getGameState(),
                   p.game_over())
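# `agent.remember` stores transitions for experience replay. A minimal sketch
# of the replay buffer assumed here (the capacity and uniform sampling are
# assumptions, not taken from the original):
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)  # old transitions drop off

    def remember(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        # uniform random minibatch for decorrelated updates
        return random.sample(self.buffer, batch_size)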