class TrainPipeline():
    save_ParaFreq = 200
    MAX_EPISODES = 2000

    def __init__(self, flag_is_shown=False, flag_is_train=True):
        # training params
        self.flag_is_shown = flag_is_shown
        self.flag_is_train = flag_is_train
        self.game = Game(self.flag_is_shown, self.flag_is_train)
        self.NN = PolicyValueNet(
            (4, self.game.board_width, self.game.board_height))
        if not self.flag_is_train:
            self.NN.load_model("./paras/policy.model")
        self.mcts_player = MCTSPlayer(self.NN.propagation)

    def train(self):
        """run the training pipeline"""
        for episode in range(self.MAX_EPISODES):
            if self.flag_is_train:
                winner, play_data = self.game.start_self_play(self.mcts_player)
                self.NN.memory(play_data)
                if len(self.NN.data_buffer) > self.NN.batch_size:
                    loss = self.NN.policy_update()
                else:
                    print("Collecting data: %d%%, "
                          % (len(self.NN.data_buffer) / self.NN.batch_size * 100),
                          end="")
                # and save the model params
                if (episode + 1) % self.save_ParaFreq == 0:
                    self.NN.save_model('./paras/policy.model')
                    print("episode = %d" % episode)
            else:
                self.game.start_play(self.mcts_player)
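# Hedged usage sketch (assumption, not part of the original file): one way to
# launch the pipeline above. Set flag_is_train=False to load the saved model
# from ./paras/policy.model and play instead of training.
if __name__ == '__main__':
    pipeline = TrainPipeline(flag_is_shown=False, flag_is_train=True)
    pipeline.train()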
def main():
    global Net
    #global epsilon
    global n_cpu
    global test_text

    port = 4001
    test_mode = False
    Net = Net_dnc
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        for c in list(cmd):
            if c == 't':
                test_mode = True
            elif c == 'T':
                test_mode = True
                test_text = True
            elif c == 'p':
                port = int(sys.argv[2])
                print("port:", port)
            #elif c == 'e':
            #    epsilon = float(sys.argv[2])
            #    print("epsilon:", epsilon)
            elif c == 'd':
                Net = Net_dnc
            elif c == 'l':
                Net = Net_lstm
            elif c == 'a':
                Net = Net_avg
            elif c == 'n':
                Net = Net_none
            elif c == '1':
                n_cpu = 1

    print("Using:", Net)

    #net = Net(len(env.symbols), len(env.actions), len(env.objects))
    net = Net(1254, 6, 36)

    # Try to load from file
    if os.path.isfile(name):
        print("Loading from file..")
        net.load_state_dict(torch.load(name))

    if not test_mode:
        net.share_memory()
        processes = []
        for rank in range(n_cpu):
            p = mp.Process(target=train, args=(net, rank))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
    else:
        env = Game(True, port, max_steps=250)
        test(net, env, True)
def main():
    is_test = False
    is_tutorial_world = False
    port = 4001
    if len(sys.argv) > 1:
        assert sys.argv[1] in ('master', 'fantasy', 'mastert', 'fantasyt')
        is_tutorial_world = sys.argv[1] in ('fantasy', 'fantasyt')
        is_test = sys.argv[1] in ('mastert', 'fantasyt')
    if len(sys.argv) > 2:
        port = int(sys.argv[2])

    env = Game(is_tutorial_world, port)
    net = Net(len(env.symbols), len(env.actions), len(env.objects))

    # Try to load from file
    if os.path.isfile(name):
        print("Loading from file..")
        net.load_state_dict(torch.load(name))

    if is_test:
        test(net, env, is_tutorial_world)
    else:
        train(net, env)
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = Game.from_config(config)

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        last_observation = game.make_image(-1)
        root.expand(game.to_play(), game.legal_actions(),
                    network.initial_inference(last_observation).numpy())
        root.add_exploration_noise(config)

        # logging.debug('Running MCTS on step {}.'.format(len(game.history)))

        # We then run a Monte Carlo Tree Search using only action sequences and
        # the model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = root.select_action(config, len(game.history), network)
        game.apply(action)
        game.store_search_statistics(root)

    logging.info('Finished episode at step {} | cumulative reward: {}'
                 .format(len(game.obs_history), sum(game.rewards)))
    return game
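# Hedged usage sketch (assumption, not from the original source): a typical
# MuZero-style self-play worker calls play_game repeatedly and hands each
# finished Game to a replay buffer before network training. The replay_buffer
# object and its save_game method are hypothetical stand-ins for whatever the
# rest of this codebase provides.
def run_selfplay(config: MuZeroConfig, network: Network, replay_buffer, num_games: int):
    for _ in range(num_games):
        game = play_game(config, network)
        replay_buffer.save_game(game)  # hypothetical buffer API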
        self.canvas.delete(self.explosionAV)
        self.canvas.delete(self.explosionAH)
        self.canvas.delete(self.explosionBV)
        self.canvas.delete(self.explosionBH)
        self.canvas.delete(self.bombA)
        self.canvas.delete(self.bombB)
        self.explosionAV = None
        self.explosionAH = None
        self.explosionBV = None
        self.explosionBH = None
        self.bombA = None
        self.bombB = None


if __name__ == '__main__':
    env = Game()
    vis = Visualiser(env, 80)
    for i in range(100):
        env.step(random.randint(0, 4), random.randint(0, 4))
        vis.update_canvas(env)

    # env.step(1, 1)
    # vis.update_canvas(env)
    # env.step(1, 1)
    # vis.update_canvas(env)
    # env.step(2, 1)
    # vis.update_canvas(env)
    # env.step(2, 1)
    # vis.update_canvas(env)
    # env.step(4, 1)
    # vis.update_canvas(env)
    plt.xlabel('Episodes')
    plt.legend(('Q-Learning', 'MinMax Q-Learning'))
    plt.show()


def assign_bins(obs, bins):
    state = np.zeros(4)
    for i in range(4):
        state[i] = np.digitize(obs[i], bins[i])
    return state


def plotPolicy(player, game):
    for state in player.Q:
        print("\n=================")
        game.draw(game.P)
        # print("State value: %s" % player.V[state])
        player.policyForState(state)


if __name__ == '__main__':
    env = Game()
    # ql_p = test(True, 'saved_players/QR')
    # min_p = test_minmax(True, 'MR')
    # ql_wins, minmax_wins = ql_vs_minmax(False)
    # print(ql_wins)
    # print(minmax_wins)
    # test_rps()
    # test_DQL()
    # testB(cont=True, filenames=("actions", "actionsB"))
    run_optimal()
    # run_optimalB()
    # test_cartpole()
import sys, pygame
import numpy as np
from env import Game

pygame.init()

field_size = 4
game = Game(field_size)
game.rand(2, 2)
game.field[0, 0] = 2
game.field[0, 1] = 4
game.field[0, 2] = 8
game.field[0, 3] = 16

size = width, height = 400, 400
screen = pygame.display.set_mode(size)
pygame.display.set_caption("2048")

bg = pygame.Surface(size)
bg.fill(pygame.Color('#bbada0'))  # 776e65 - font of numbers

indent = 5
box_size = 90
boxes = []
coords = []
for i in range(field_size ** 2):
    bx = pygame.Surface((box_size, box_size))
    bx.fill(pygame.Color("#eee4da"))
    boxes.append(bx)
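# Hedged sketch (assumption, not part of the original file): a minimal render
# loop for the setup above. Tile positions are derived here directly from
# `indent` and `box_size`; the actual file may fill `coords` differently, and
# the font size and label offsets below are illustrative only.
font = pygame.font.SysFont(None, 48)
running = True
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
    screen.blit(bg, (0, 0))
    for i in range(field_size ** 2):
        row, col = divmod(i, field_size)
        x = indent + col * (box_size + indent)
        y = indent + row * (box_size + indent)
        screen.blit(boxes[i], (x, y))
        value = int(game.field[row, col])
        if value:
            label = font.render(str(value), True, pygame.Color('#776e65'))
            screen.blit(label, (x + 25, y + 25))
    pygame.display.flip()
pygame.quit()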
        if delta < theta:
            return delta, Value_table  # exit the loop once converged


def policy_improvement(env, dealer, player, Value_table):
    hit = update_value_for_one_state(env, 0, dealer, player, Value_table)
    stick = update_value_for_one_state(env, 1, dealer, player, Value_table)
    # choose whichever action has the higher value
    if hit > stick:
        Policy[dealer - 1, player - 1] = 0
    else:
        Policy[dealer - 1, player - 1] = 1


if __name__ == "__main__":
    env = Game()
    initial_policy()
    result = []
    for epi in range(episode_num):
        # policy evaluation
        pe, Value_table = policy_evaluation(env, Value_table)
        print(pe, epi)
        # policy improvement
        policy_stable = True  # end early if the policy was not updated for any state
        for dealer in range(1, 11):
            for player in range(1, 22):
                old_action = Policy[dealer - 1, player - 1]
                policy_improvement(env, dealer, player, Value_table)
                if old_action != Policy[dealer - 1, player - 1]:
                    policy_stable = False
        # print(Policy)
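        # Hedged completion sketch (assumption, not in the original snippet):
        # classic policy iteration stops once no state's greedy action changed
        # during the improvement sweep.
        if policy_stable:
            print("policy stable after %d iterations" % (epi + 1))
            break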
def train(net, rank):
    torch.set_num_threads(1)  # also do: export MKL_NUM_THREADS=1
    net.reset()
    env = Game(True, 4000 + rank + 1, max_steps=250)
    target_net = Net(1254, 6, 36)
    target_net.load_state_dict(net.state_dict())
    target_net.reset()

    epsilon = epsilon1
    optimizer = optim.RMSprop(net.parameters(), lr=learning_rate)

    last_save = time.time()
    last_notify = time.time()
    last_sync = time.time()

    episode_number = 0
    terminal = True
    prev_value = None
    available_objects = None
    num_objects = len(env.objects)

    recent_rewards_of_episodes = []
    recent_steps_of_episodes = []

    quest1_reward_cnt = 0
    quest2_reward_cnt = 0
    quest3_reward_cnt = 0
    quest4_reward_cnt = 0
    quest1_rewards = np.zeros(100)
    quest2_rewards = np.zeros(100)
    quest3_rewards = np.zeros(100)
    quest4_rewards = np.zeros(100)

    if rank == 0:
        stats = []

    while True:
        if terminal:
            student_saw_obelisk = False
            quest1_rewards[episode_number % len(quest1_rewards)] = 0
            quest2_rewards[episode_number % len(quest2_rewards)] = 0
            quest3_rewards[episode_number % len(quest3_rewards)] = 0
            quest4_rewards[episode_number % len(quest4_rewards)] = 0
            prev_value = None
            num_steps = 0
            net.reset()
            target_net.reset()
            state, reward, terminal, available_objects = env.reset()
            sum_rewards = reward

        state = torch.LongTensor(state)
        objects_probs = net(Variable(state.unsqueeze(0)))
        _objects_probs = objects_probs.data.numpy()

        # Choose action
        if random.random() < epsilon:
            if available_objects is None:
                objects = list(enumerate(env.objects))
            else:
                objects = [_ for _ in list(enumerate(env.objects))
                           if _[0] in available_objects]
            _object = random.choice(objects)[0]
        else:
            if available_objects is not None:
                mask = np.zeros(num_objects)
                for e in available_objects:
                    mask[e] = 1
                _objects_probs = objects_probs.data.numpy() * mask
                _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
            _object = int(np.argmax(_objects_probs))

        prev_value = objects_probs[0, _object]

        # step the environment and get new measurements
        state, reward, terminal, available_objects = env.step(5, _object)
        sum_rewards += reward
        num_steps += 1

        if reward > 10 - 0.0001:
            quest4_reward_cnt = quest4_reward_cnt + 1
            quest4_rewards[episode_number % len(quest4_rewards)] = 1
        elif reward > 8 - 0.0001:
            quest3_reward_cnt = quest3_reward_cnt + 1
            quest3_rewards[episode_number % len(quest3_rewards)] = 1
            if not disable_curriculum:
                if not student_saw_obelisk:
                    reward = -8
                    terminal = True
        elif reward > 7 - 0.0001:
            student_saw_obelisk = True
            quest2_reward_cnt = quest2_reward_cnt + 1
            quest2_rewards[episode_number % len(quest2_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest2_rewards) < 0.75 and random.random() < 0.9:
                    terminal = True
        elif reward > 5 - 0.0001:
            quest1_reward_cnt = quest1_reward_cnt + 1
            quest1_rewards[episode_number % len(quest1_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest1_rewards) < 0.9 and random.random() < 0.85:
                    terminal = True

        if 2 * epsilon > (epsilon1 + epsilon2):
            if np.mean(quest3_rewards) > .98:
                if np.mean(quest2_rewards) > .98:
                    if np.mean(quest1_rewards) > .98:
                        epsilon = epsilon2
                        if rank == 0:
                            notify("Epsilon is now:" + str(epsilon))

        if terminal:
            next_value = 0
        else:
            if target_q_ts is None:
                next_value = float(np.max(_objects_probs))
            else:
                state = torch.LongTensor(state)
                objects_probs = target_net(Variable(state.unsqueeze(0)))
                _objects_probs = objects_probs.data.numpy()
                if available_objects is not None:
                    mask = np.zeros(num_objects)
                    for e in available_objects:
                        mask[e] = 1
                    _objects_probs = _objects_probs * mask
                    _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
                next_value = float(np.max(_objects_probs))

        loss = (reward + gamma * next_value - prev_value)**2

        # Update for only a tenth of the non-important steps
        if abs(reward) > 4 or random.random() < 0.05:
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm(net.parameters(), 1)
            optimizer.step()

        if terminal:
            recent_rewards_of_episodes.append(sum_rewards)
            recent_steps_of_episodes.append(num_steps)
            if len(recent_rewards_of_episodes) > 100:
                recent_rewards_of_episodes.pop(0)
            if len(recent_steps_of_episodes) > 100:
                recent_steps_of_episodes.pop(0)
            episode_number += 1

            if target_q_ts is not None and time.time() - last_sync > target_q_ts:
                if rank == 0:
                    print("Update target")
                target_net.load_state_dict(net.state_dict())
                last_sync = time.time()

            if rank == 0:
                stats.append({})
                stats[-1]["episode_number"] = episode_number
                stats[-1]["sum_rewards"] = sum_rewards
                stats[-1]["num_steps"] = num_steps
                stats[-1]["mean_recent_rewards_of_episodes"] = np.mean(
                    recent_rewards_of_episodes)
                stats[-1]["mean_recent_steps_of_episodes"] = np.mean(
                    recent_steps_of_episodes)
                stats[-1]["quest1_reward_cnt"] = quest1_reward_cnt
                stats[-1]["quest2_reward_cnt"] = quest2_reward_cnt
                stats[-1]["quest3_reward_cnt"] = quest3_reward_cnt
                stats[-1]["quest4_reward_cnt"] = quest4_reward_cnt
                stats[-1]["mean_quest1_rewards"] = np.mean(quest1_rewards)
                stats[-1]["mean_quest2_rewards"] = np.mean(quest2_rewards)
                stats[-1]["mean_quest3_rewards"] = np.mean(quest3_rewards)
                stats[-1]["mean_quest4_rewards"] = np.mean(quest4_rewards)

                summary = "{} {:.4} {} {:.4} {:.4} Qc: {} {} {} {} Q: {} {} {} {}".format(
                    episode_number, sum_rewards, num_steps,
                    np.mean(recent_rewards_of_episodes),
                    np.mean(recent_steps_of_episodes),
                    quest1_reward_cnt, quest2_reward_cnt,
                    quest3_reward_cnt, quest4_reward_cnt,
                    np.mean(quest1_rewards), np.mean(quest2_rewards),
                    np.mean(quest3_rewards), np.mean(quest4_rewards))
                print(summary)

                if save_every is not None:
                    if time.time() - last_save > save_every:
                        print("Saving..")
                        torch.save(net.state_dict(), name)
                        with open(name_stats, "wb") as _fh:
                            pickle.dump(stats, _fh)
                        last_save = time.time()

                if notify_every is not None:
                    if time.time() - last_notify > notify_every:
                        print("Notify..")
                        notify(summary)
                        last_notify = time.time()

                if max_episodes is not None and episode_number == max_episodes:
                    torch.save(net.state_dict(), name)
                    with open(name_stats, "wb") as _fh:
                        pickle.dump(stats, _fh)
                    notify(summary)
                    notify("Done.")
                    print("Done.")
                    sys.exit()