def simulateGame(player, opponent):
    # Returns fitness delta
    game = ConnectFour()
    illegal_moves = 0
    random_moves_left = 3
    while not game.isFinished():  # TODO track stats?
        if game.isOurTurn():
            try:
                pickAndMakeMove(game, player)
            except IndexError:
                illegal_moves += 1
                if illegal_moves >= NUMBER_ILLEGAL_MOVES_ALLOWED:
                    # Penalise player
                    return -NUMBER_TO_SAMPLE
                else:
                    continue
            if game.isFinished():
                break
            if random_moves_left > 0:
                pickAndMakeMove(game, agents.RandomAgent())
                random_moves_left -= 1
            else:
                try:
                    pickAndMakeMove(game, opponent)
                except IndexError:
                    pickAndMakeMove(game, agents.RandomAgent())
        else:
            # Not our turn
            if random_moves_left > 0:
                pickAndMakeMove(game, agents.RandomAgent())
                random_moves_left -= 1
            else:
                try:
                    pickAndMakeMove(game, opponent)
                except IndexError:
                    pickAndMakeMove(game, agents.RandomAgent())
            if game.isFinished():
                break
            try:
                pickAndMakeMove(game, player)
            except IndexError:
                illegal_moves += 1
                if illegal_moves >= NUMBER_ILLEGAL_MOVES_ALLOWED:
                    # Penalise player
                    return -NUMBER_TO_SAMPLE
                else:
                    continue
    # Game is finished (or illegal move made)
    # TODO debug prints, or stats
    return game.score()
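# Hedged usage sketch, not from the original source: one way simulateGame could
# score candidates in an evolutionary loop. `averageFitness` and `games` are
# hypothetical; averaging smooths out the randomness introduced by the warm-up
# moves and the RandomAgent fallback.
def averageFitness(player, opponent, games=10):
    return sum(simulateGame(player, opponent) for _ in range(games)) / games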
def __init__(self, env, feature_mapper):
    # Same mapper
    self.feature_mapper = feature_mapper
    # Play one episode to generate data for the scaler
    random_agent = agents.RandomAgent(env.n_stocks)
    X = []
    done = False
    state = env.reset()
    X.append(self.feature_mapper(state))
    while not done:
        action = random_agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        state = next_state.copy()
        X.append(self.feature_mapper(state))
    # Create and fit scaler
    self.scaler = StandardScaler()
    self.scaler.fit(X)
    # Record metadata
    self.size = len(X[0])
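# Hedged usage sketch, not from the original source: once fitted, the scaler is
# applied to newly mapped states before they reach a model. `generator` and
# `state` are hypothetical names for an instance of this class and a raw
# environment state.
features = generator.scaler.transform([generator.feature_mapper(state)])[0]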
def main():
    agent = agents.RandomAgent()
    task = marioai.Task()
    exp = marioai.Experiment(task, agent)
    exp.max_fps = 24
    task.env.level_type = 0
    exp.doEpisodes()
def __init__(self, env, monitor='output/', seed=None):
    self.env = env
    self.agents = {
        'universe': agents.A3C(env, monitor + 'universe/',
                               CHECKPOINTS + '/universe/' + env + '/', 1),
        'tensorpack': agents.TPAgent(env, monitor + 'tensorpack/',
                                     CHECKPOINTS + '/tensorpack/' + env + '/' + env, 1),
        'random': agents.RandomAgent()
    }
    self.seed = seed
    self.best = ''
def next_turn(action):
    agent = agents.RandomAgent()
    if agent.make_move(1, action) is None:
        return (action, 0, True)
    else:
        move = agent.make_move(-1, action)
        if move is None:
            return (action, 1, True)
        else:
            return (move, 0, False)
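# Hedged usage sketch, not from the original source: next_turn returns a
# (board, result, done) triple, so a random-vs-random game can be driven in a
# loop; `initial_board` is a hypothetical starting position passed as `action`.
board, result, done = initial_board, 0, False
while not done:
    board, result, done = next_turn(board)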
def get_agent(args, env):
    """Get agent by name."""
    if args.agent == 'random':
        agent = agents.RandomAgent(env.action_space.n)
    elif args.agent == 'mfec':
        agent = agents.MFECAgent(env.action_space.n, 84 * 84, args.logdir,
                                 hash_bits=args.hash_bits,
                                 projection_size=args.projection_size,
                                 epsilon_steps=args.epsilon_steps)
    else:
        raise ValueError('unknown agent: {}'.format(args.agent))
    return agent
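# Hedged usage sketch, not from the original source: flags mirroring the
# attributes get_agent reads from `args`; the flag names, defaults, and the
# environment id are illustrative assumptions only.
import argparse
import gym

parser = argparse.ArgumentParser()
parser.add_argument('--agent', default='random')
parser.add_argument('--logdir', default='logs')
parser.add_argument('--hash_bits', type=int, default=64)
parser.add_argument('--projection_size', type=int, default=64)
parser.add_argument('--epsilon_steps', type=int, default=10000)
args = parser.parse_args()
agent = get_agent(args, gym.make('Breakout-v0'))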
def generate_training_data(episodes):
    env = gym.make('CartPole-v0')
    env._max_episode_steps = game_lib.TIMESTEPS
    agent = agents.RandomAgent(env)
    start = time.time()
    game_lib.play(env, agent, episodes=episodes, score_threshold=80,
                  render=False, save_training_data=True)
    print('Elapsed time: {t} seconds'.format(t=round(time.time() - start, 4)))
    env.close()
def main():
    env = gym.make('gym_mmab:mmab-v0', n_players=3, n_arms=5)
    # env = gym.make('gym_mmab:mmab-v0')
    # If you leave the arguments empty, the defaults are n_players=3, n_arms=10
    agent0_candidates = [agents.Agent(0, env.n_arms),
                         agents.RandomAgent(0, env.n_arms),
                         agents.QAgent(0, env.n_arms)]
    saved_obs_history = []
    for agent0 in agent0_candidates:
        print("====================")
        print(f"Testing with {type(agent0).__name__}")
        print("====================")
        obs_history = test_selectedagent(env, agent0)
        saved_obs_history.append(obs_history)
        print()
    print("====================")
    print("Comparing cumulative rewards in last 100 rounds")
    for n, agent0 in enumerate(agent0_candidates):
        tot_rewards = np.array(saved_obs_history[n][-100:]).sum(axis=0)[0]
        print(f"{type(agent0).__name__:>14}: {tot_rewards}")
    print()
def test_agents():
    """Tests for the stock agents."""
    result = True
    game = ttt.TicTacToe(3, 3)
    random = ag.RandomAgent('Random', game)
    dumb = ag.DumbAgent('Dumb', game)
    optimal = ag.MinimaxAgent('Minimax', game)
    # Smoke test for the random agent.
    game.take_action(random.select_move())
    game.undo_action()
    result = result and expect_equal(dumb.select_move(), 0, 'dumb first move')
    game.take_action(0)
    result = result and expect_equal(dumb.select_move(), 1, 'dumb second move')
    game.take_action(4)
    game.take_action(3)
    result = result and expect_equal(optimal.select_move(), 6, 'minimax move')
    result = result and expect_equal(optimal.optimal_moves(), [6],
                                     'minimax optimal moves')
    game.reset()
    # result = result and expect_equal(optimal.optimal_moves(), range(9),
    #                                  'minimax all moves optimal')
    test_result(result, 'Agents Test')
def main():
    game = ttt.TicTacToe(8, 4)
    random = ag.RandomAgent('Random', game)
    config = rl.Config(
        training_epochs=2,
        games_per_epoch=10,
        rollouts_per_move=20,
        rollout_depth=4,
        rollout_policy=functools.partial(po.alpha_zero_mcts_policy, c_puct=10.0),
        play_policy=functools.partial(po.alpha_zero_play_policy, tau=1.5),
        inference_policy=po.greedy_prior_policy,
        opponent_rollout_policy=None,
        opponent_play_policy=None,
        policy_target=functools.partial(po.alpha_zero_visit_counts_to_target,
                                        action_space=game.action_space(),
                                        tau=1.0),
        inference_rollouts_per_move=40,
        inference_rollout_depth=4)
    model = mo.KerasModel(game, [128], [64, 32], [16, 4], data_passes=100)
    agent = rl.RLAgent('RL Agent', game, model, config, [random], 100)
    agent.train(print_progress=True)
    gl.play_match(g=game, agent_a=agent, agent_b=random, num_games=4)
    gl.interactive_play(game, agent)
parser.add_argument(
    '--weights', type=str, default=None,
    help="weights files, only valid for --agent=neural")
parser.add_argument(
    '--lite-weights', type=str, default=None,
    help="tf lite weights files, must be set for --agent=neural_lite")
parser.add_argument('--trials', type=int, default=10,
                    help='num trials to run; new agent per trial')
opts = parser.parse_args()

evaluator = cartpole_fitness.CartPoleFitness(render=opts.env_render)

print("trial\ttotal_reward")
for trial_idx in range(opts.trials):
    if opts.agent == 'random':
        agent = agents.RandomAgent()
    elif opts.agent == 'neural':
        agent = agents.NeuralAgent()
        if opts.weights is not None:
            agent.set_weights_of_model(np.load(opts.weights))
    elif opts.agent == 'neural_lite':
        agent = agents.NeuralLiteAgent(tflite_file=opts.lite_weights)
    else:
        raise Exception("unexpected agent type [%s]" % opts.agent)
    print("%d\t%d" % (trial_idx, evaluator.fitness(agent)))
    sys.stdout.flush()
import pickle
import matplotlib.pyplot as plt

test_p = pickle.load(open("agentdata/X1QLAGENT_GAMES_100007-19-46-16.p", "rb"))

from spades import Spades
from spades import run_x_games_and_pickle
import spades
import agents

agent_25k = pickle.load(
    open("agentdata/X2QLAGENT_GAMES_250007-20-50-0.p", "rb"))
agent_10k = pickle.load(open("agentdata/QLAGENT_GAMES_100007-21-1-43.p", "rb"))

if __name__ == "__main__":
    ql = agents.QLearningAgent("test", epsilon=0)
    agent_10k.epsilon = 0
    players = [agent_10k, agents.RandomAgent(2)]
    run_x_games_and_pickle(players, 2000)
import agents
import pong_env
import pygame
import matplotlib.pyplot as plt
from datetime import datetime

# get datetime data for file name
now = datetime.now()

player_random = agents.RandomAgent(3)
player_pg = agents.PolicyGradientAgent(5, 3)

'''
# player_pg uses a saved model
model = "pg_model_1_3"
player_pg.from_load_model(model)
print("Model loaded.")
'''

num_play = 2
clock = pygame.time.Clock()
scores = []
break_learning = False
display = True

for i in range(num_play):
    done = False
    score = 0
    game = pong_env.Pong()
    while not done:
        if display:
def train(self, no_scenarios, print_log, plot_graphs, save_graphs,
          collect_comparison=False):
    stats = {
        'scenarios': [],
        'rewards': [],
        'durations': [],
        'detected': [],
        'missed': [],
        'ttf': [],
        'napfd': [],
        'recall': [],
        'avg_precision': [],
        'result': [],
        'step': [],
        'env': self.scenario_provider.name,
        'agent': self.agent.name,
        'action_size': self.agent.action_size,
        'history_length': self.agent.histlen,
        'rewardfun': self.reward_function.__name__,
        'sched_time': self.scenario_provider.avail_time_ratio,
        'hidden_size': 'x'.join(str(x) for x in self.agent.hidden_size)
                       if hasattr(self.agent, 'hidden_size') else 0
    }

    if collect_comparison:
        cmp_agents = {
            'heur_sort': agents.HeuristicSortAgent(self.agent.histlen),
            'heur_weight': agents.HeuristicWeightAgent(self.agent.histlen),
            'heur_random': agents.RandomAgent(self.agent.histlen)
        }
        stats['comparison'] = {}
        for key in cmp_agents.keys():
            stats['comparison'][key] = {
                'detected': [],
                'missed': [],
                'ttf': [],
                'napfd': [],
                'recall': [],
                'avg_precision': [],
                'durations': []
            }

    sum_actions = 0
    sum_scenarios = 0
    sum_detected = 0
    sum_missed = 0
    sum_reward = 0

    for (i, sc) in enumerate(self.scenario_provider, start=1):
        if i > no_scenarios:
            break
        start = time.time()
        if print_log:
            print('ep %d:\tscenario %s\t' % (sum_scenarios + 1, sc.name), end='')
        (result, reward) = self.process_scenario(sc)
        end = time.time()

        # Statistics
        sum_detected += result[0]
        sum_missed += result[1]
        sum_reward += np.mean(reward)
        sum_actions += 1
        sum_scenarios += 1
        duration = end - start

        stats['scenarios'].append(sc.name)
        stats['rewards'].append(np.mean(reward))
        stats['durations'].append(duration)
        stats['detected'].append(result[0])
        stats['missed'].append(result[1])
        stats['ttf'].append(result[2])
        stats['napfd'].append(result[3])
        stats['recall'].append(result[4])
        stats['avg_precision'].append(result[5])
        stats['result'].append(result)
        stats['step'].append(sum_scenarios)

        if print_log:
            print(' finished, reward: %.2f,\trunning mean: %.4f,\tduration: %.1f,\tresult: %s' %
                  (np.mean(reward), sum_reward / sum_scenarios, duration, result))

        if collect_comparison:
            for key in stats['comparison'].keys():
                start = time.time()
                cmp_res = process_scenario(cmp_agents[key], sc, preprocess_discrete)
                end = time.time()
                stats['comparison'][key]['detected'].append(cmp_res[0])
                stats['comparison'][key]['missed'].append(cmp_res[1])
                stats['comparison'][key]['ttf'].append(cmp_res[2])
                stats['comparison'][key]['napfd'].append(cmp_res[3])
                stats['comparison'][key]['recall'].append(cmp_res[4])
                stats['comparison'][key]['avg_precision'].append(cmp_res[5])
                stats['comparison'][key]['durations'].append(end - start)

        # Data Dumping
        if self.dump_interval > 0 and sum_scenarios % self.dump_interval == 0:
            pickle.dump(stats, open(self.stats_file + '.p', 'wb'))

        if self.validation_interval > 0 and (
                sum_scenarios == 1 or sum_scenarios % self.validation_interval == 0):
            if print_log:
                print('ep %d:\tRun test... ' % sum_scenarios, end='')
            self.run_validation(sum_scenarios)
            pickle.dump(self.validation_res, open(self.val_file + '.p', 'wb'))
            if print_log:
                print('done')

    if self.dump_interval > 0:
        self.agent.save(self.agent_file)
        pickle.dump(stats, open(self.stats_file + '.p', 'wb'))

    if plot_graphs:
        plot_stats.plot_stats_single_figure(self.file_prefix,
                                            self.stats_file + '.p',
                                            self.val_file + '.p', 1,
                                            plot_graphs=plot_graphs,
                                            save_graphs=save_graphs)
    if save_graphs:
        plot_stats.plot_stats_separate_figures(self.file_prefix,
                                               self.stats_file + '.p',
                                               self.val_file + '.p', 1,
                                               plot_graphs=False,
                                               save_graphs=save_graphs)

    return np.mean(stats['napfd'])
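# Hedged usage sketch, not from the original source: train() is a method, so a
# hypothetical experiment object `exp` would invoke it like this and receive
# the mean NAPFD over all processed scenarios.
mean_napfd = exp.train(no_scenarios=100, print_log=True,
                       plot_graphs=False, save_graphs=False,
                       collect_comparison=True)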
plt.ylabel("Cumulative Reward") plt.show() def plotQ(Q): states = [[0, 0], [0, 1], [1, 0], [1, 1]] for state in states: for a in [0, 1]: print("Q[{},{}]={}".format(state, a, Q[env.asint(state), a])) # Number of iterations n_iter = 1000 # environment specs env = EvidenceEnv(n=2, p=0.75) agent = agents.RandomAgent(env) runAgent() # define agent agent = agents.TabularQAgent(env) plotQ(agent.Q) runAgent() plotQ(agent.Q) actualQ = agent.Q agent = agents.NeuralAgent(env, actualQ) plotQ(agent.Q) runAgent() plotQ(agent.Q)
import pickle
import matplotlib.pyplot as plt

test_p = pickle.load(open("agentdata/QLAGENT_GAMES_100007-18-2-16.p", "rb"))

from spades import Spades
from spades import run_x_games_and_pickle
import spades
import agents

if __name__ == "__main__":
    players = [agents.QLearningAgent(1), agents.RandomAgent(4)]
    run_x_games_and_pickle(players, 100000)
def main():
    game = ck.Checkers(8, 3)
    random = ag.RandomAgent('Random', game)
    print(game.action_space())
    gl.interactive_play(game, random)
def set_agents(self, model_path_a, model_path_b, model_path_m):
    if model_path_a == 'human' or model_path_b == 'human':
        game_mode = 'pygame'
    else:
        game_mode = 'text'

    self.env = game.GameState(game_mode)

    if model_path_a == 'random':
        print('load player model:', model_path_a)
        self.player = agents.RandomAgent(BOARD_SIZE)
    elif model_path_a == 'puct':
        print('load player model:', model_path_a)
        self.player = agents.PUCTAgent(BOARD_SIZE, N_MCTS_PLAYER)
    elif model_path_a == 'uct':
        print('load player model:', model_path_a)
        self.player = agents.UCTAgent(BOARD_SIZE, N_MCTS_PLAYER)
    elif model_path_a == 'human':
        print('load player model:', model_path_a)
        self.player = agents.HumanAgent(BOARD_SIZE, self.env)
    elif model_path_a == 'web':
        print('load player model:', model_path_a)
        self.player = agents.WebAgent(BOARD_SIZE)
    else:
        print('load player model:', model_path_a)
        self.player = agents.ZeroAgent(BOARD_SIZE, N_MCTS_PLAYER,
                                       IN_PLANES_PLAYER, noise=False)
        self.player.model = model.PVNet(N_BLOCKS_PLAYER, IN_PLANES_PLAYER,
                                        OUT_PLANES_PLAYER, BOARD_SIZE).to(device)
        state_a = self.player.model.state_dict()
        my_state_a = torch.load(
            model_path_a, map_location='cuda:0' if use_cuda else 'cpu')
        for k, v in my_state_a.items():
            if k in state_a:
                state_a[k] = v
        self.player.model.load_state_dict(state_a)

    if model_path_b == 'random':
        print('load enemy model:', model_path_b)
        self.enemy = agents.RandomAgent(BOARD_SIZE)
    elif model_path_b == 'puct':
        print('load enemy model:', model_path_b)
        self.enemy = agents.PUCTAgent(BOARD_SIZE, N_MCTS_ENEMY)
    elif model_path_b == 'uct':
        print('load enemy model:', model_path_b)
        self.enemy = agents.UCTAgent(BOARD_SIZE, N_MCTS_ENEMY)
    elif model_path_b == 'human':
        print('load enemy model:', model_path_b)
        self.enemy = agents.HumanAgent(BOARD_SIZE, self.env)
    elif model_path_b == 'web':
        print('load enemy model:', model_path_b)
        self.enemy = agents.WebAgent(BOARD_SIZE)
    else:
        print('load enemy model:', model_path_b)
        self.enemy = agents.ZeroAgent(BOARD_SIZE, N_MCTS_ENEMY,
                                      IN_PLANES_ENEMY, noise=False)
        self.enemy.model = model.PVNet(N_BLOCKS_ENEMY, IN_PLANES_ENEMY,
                                       OUT_PLANES_ENEMY, BOARD_SIZE).to(device)
        state_b = self.enemy.model.state_dict()
        my_state_b = torch.load(
            model_path_b, map_location='cuda:0' if use_cuda else 'cpu')
        for k, v in my_state_b.items():
            if k in state_b:
                state_b[k] = v
        self.enemy.model.load_state_dict(state_b)

    # monitor agent
    self.monitor = agents.ZeroAgent(BOARD_SIZE, N_MCTS_MONITOR,
                                    IN_PLANES_ENEMY, noise=False)
    self.monitor.model = model.PVNet(N_BLOCKS_ENEMY, IN_PLANES_ENEMY,
                                     OUT_PLANES_ENEMY, BOARD_SIZE).to(device)
    state_b = self.monitor.model.state_dict()
    my_state_b = torch.load(
        model_path_m, map_location='cuda:0' if use_cuda else 'cpu')
    for k, v in my_state_b.items():
        if k in state_b:
            state_b[k] = v
    self.monitor.model.load_state_dict(state_b)
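# Hedged usage sketch, not from the original source: any argument other than
# 'random', 'puct', 'uct', 'human' or 'web' is treated as a checkpoint path for
# a ZeroAgent; `evaluator` and the paths below are hypothetical.
evaluator.set_agents('human', './data/enemy_model.pickle',
                     './data/monitor_model.pickle')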
import agents  # assumed import: `agents` is used below but was missing here
import models
import Stockmarket  # assumed import: used for Stockmarket.StockMarket below
from helpers import play_game
import numpy as np

INITIAL_INVESTMENT = 20000.0
train_env = Stockmarket.StockMarket("train", INITIAL_INVESTMENT)

def identity(x):
    return x

feature_generator = models.FeatureGenerator(train_env, identity)
random_agent = agents.RandomAgent(train_env.n_stocks)
linear_agent = agents.LinearAgent(train_env.n_stocks, feature_generator,
                                  gamma=0.95, epsilon_decay=0.9995,
                                  epsilon_min=0.01, alpha=0.01,
                                  momentum=0.9)

print(play_game(train_env, random_agent))

# Train
print("Training [LinearAgent, Random]")
N = 50  # Should really do about 200
val = np.zeros((2, N))
for i in range(N):
                                histlen=args.histlen)
    # If the action size is 1 we use MLPClassifier;
    # for action size 2 we use MLPRegressor
    elif args.agent == 'network':
        if args.reward in ('binary',):  # one-element tuple, not a bare string
            action_size = 1
        else:
            action_size = 2
        agent = agents.NetworkAgent(state_size=state_size,
                                    action_size=action_size,
                                    hidden_size=args.hiddennet,
                                    histlen=args.histlen)
    elif args.agent == 'heur_random':
        agent = agents.RandomAgent(histlen=args.histlen)
    elif args.agent == 'heur_sort':
        agent = agents.HeuristicSortAgent(histlen=args.histlen)
    elif args.agent == 'heur_weight':
        agent = agents.HeuristicWeightAgent(histlen=args.histlen)
    else:
        print('Unknown Agent')
        sys.exit()

    if args.scenario_provider == 'random':
        scenario_provider = scenarios.RandomScenarioProvider()
    elif args.scenario_provider == 'incremental':
        scenario_provider = scenarios.IncrementalScenarioProvider(
            episode_length=args.no_scenarios)
    elif args.scenario_provider == 'paintcontrol':
        scenario_provider = scenarios.IndustrialDatasetScenarioProvider(
def train_visual_module(img_width, img_height, pc_ensemble, hd_ensemble,
                        lab_config, level, level_boundary_min,
                        level_boundary_max):
    assert str(img_width) == lab_config['width'], \
        "DM-Lab camera width does not match the width of the visual module"
    assert str(img_height) == lab_config['height'], \
        "DM-Lab camera height does not match the height of the visual module"

    model = networks.VisualModule(img_width, img_height, pc_ensemble, hd_ensemble)
    # model = tf.keras.models.Sequential([
    #     tf.keras.layers.Dense(32, input_shape=(64, 64, 3))
    # ])

    # Prepare env and random agent
    observations = ['RGB', 'DEBUG.POS.ROT', 'DEBUG.POS.TRANS']
    env = deepmind_lab.Lab(level, observations, config=lab_config,
                           renderer='software')
    agent = agents.RandomAgent(env.action_spec(), forbidden_actions=[
        'JUMP', 'FIRE', 'CROUCH', 'LOOK_DOWN_UP_PIXELS_PER_FRAME'
    ])

    episode_length = 100
    total_frames = episode_length * 1e6
    batch_size = 32
    epochs = 1000
    # cast to int: total_frames is a float (1e6) and Keras expects an
    # integer step count
    training_steps_per_epoch = int(total_frames // batch_size)

    # Record training data
    def generate_batch():
        replay_buffer = buffers.ReplayBuffer(batch_size * episode_length)
        # pc_boundary_ = (pc_ensemble.pos_max - pc_ensemble.pos_min)
        pc_boundary_scale = (pc_ensemble.pos_max - pc_ensemble.pos_min) / \
            (level_boundary_max - level_boundary_min)
        level_boundary_mean = (level_boundary_max - level_boundary_min) / 2. \
            + level_boundary_min
        while True:
            env.reset()
            # Collect observations
            for _ in range(batch_size):
                for _ in range(episode_length):
                    if not env.is_running():
                        print('Environment stopped early')
                        env.reset()
                        agent.reset()
                    obs = env.observations()
                    if not obs:
                        raise Exception('Observations empty!')
                    # Normalize observations
                    rgb = obs['RGB']
                    target_pos = obs['DEBUG.POS.TRANS'][:2]
                    target_rot = obs['DEBUG.POS.ROT'][1]
                    target_pos -= level_boundary_mean
                    target_pos *= pc_boundary_scale
                    target_rot = target_rot * ((2. * np.pi) / 360.)
                    replay_buffer.add([rgb, target_pos, target_rot])
                    action = agent.step()
                    env.step(action, num_steps=1)

            # Form batches
            # TODO make sure that the replay buffer is actually filled
            # (no early environment stoppings, see above)
            target_pos_batch = np.zeros((batch_size, episode_length, 2))
            target_rot_batch = np.zeros((batch_size, episode_length, 1))
            rgb_batch = np.zeros(
                (batch_size, episode_length, 3, img_width, img_height))
            for i in range(batch_size):
                sampled_obs = replay_buffer.sample(episode_length)
                target_pos_batch[i, :, :] = np.array(
                    list(map(lambda x: x[1], sampled_obs)))
                target_rot_batch[i, :, 0] = np.array(
                    list(map(lambda x: x[2], sampled_obs)))
                rgb_batch[i, :, :, :, :] = np.array(
                    list(map(lambda x: x[0], sampled_obs)))
            replay_buffer.clear()

            # Compute training targets
            targets = utils.encode_targets(target_pos_batch, target_rot_batch,
                                           [pc_ensemble], [hd_ensemble])
            rgb_batch = np.swapaxes(rgb_batch, 2, 4)
            rgb_batch = np.swapaxes(rgb_batch, 2, 3)
            rgb_batch = tf.convert_to_tensor(rgb_batch)
            yield (rgb_batch), (targets[0], targets[1])

    # Prepare model training
    model.compile(optimizer=tf.optimizers.RMSprop(learning_rate=1e-5,
                                                  momentum=0.9,
                                                  clipvalue=1e-5),
                  loss={
                      'output_1': softmax_cross_entropy_logits_loss,
                      'output_2': softmax_cross_entropy_logits_loss
                  })

    # batch_generator = generate_batch(batch_size=10)
    # import ipdb; ipdb.set_trace()
    # for _ in range(epochs):
    #     for _ in range(training_steps_per_epoch):
    #         x, y = next(batch_generator)
    #         pc_pred, hd_pred = model.train_on_batch(x, y)

    # model.fit_generator(
    #     generate_batch(),
    #     epochs=epochs,
    #     steps_per_epoch=training_steps_per_epoch,
    #     verbose=1,
    # )
    model.fit(
        generate_batch(),
        epochs=epochs,
        steps_per_epoch=training_steps_per_epoch,
        verbose=1,
    )
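# Hedged usage sketch, not from the original source: a call with a hypothetical
# DM-Lab level id; the config's 'width'/'height' strings must match the
# module's input size, and the ensembles and boundaries are placeholders
# defined elsewhere.
lab_config = {'width': '64', 'height': '64'}
train_visual_module(64, 64, pc_ensemble, hd_ensemble, lab_config,
                    'nav_maze_static_01', level_boundary_min,
                    level_boundary_max)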
def set_agents(self, model_path_a, model_path_b, model_path_m):
    # If either player is human, run the game in a pygame window;
    # otherwise print text only
    if model_path_a == 'human' or model_path_b == 'human':
        game_mode = 'pygame'
    else:
        game_mode = 'text'

    # Set the game mode of the env module
    self.env = game.GameState(game_mode)

    # Set up the player's model
    if model_path_a == 'random':
        print('load player model:', model_path_a)
        self.player = agents.RandomAgent(BOARD_SIZE)
    elif model_path_a == 'puct':
        print('load player model:', model_path_a)
        self.player = agents.PUCTAgent(BOARD_SIZE, N_MCTS_PLAYER)
    elif model_path_a == 'uct':
        print('load player model:', model_path_a)
        self.player = agents.UCTAgent(BOARD_SIZE, N_MCTS_PLAYER)
    elif model_path_a == 'human':
        print('load player model:', model_path_a)
        self.player = agents.HumanAgent(BOARD_SIZE, self.env)
    elif model_path_a == 'web':
        print('load player model:', model_path_a)
        self.player = agents.WebAgent(BOARD_SIZE)
    else:
        print('load player model:', model_path_a)
        self.player = agents.ZeroAgent(BOARD_SIZE, N_MCTS_PLAYER,
                                       IN_PLANES_PLAYER, noise=False)
        self.player.model = model.PVNet(N_BLOCKS_PLAYER, IN_PLANES_PLAYER,
                                        OUT_PLANES_PLAYER, BOARD_SIZE).to(device)
        state_a = self.player.model.state_dict()
        my_state_a = torch.load(
            model_path_a, map_location='cuda:0' if use_cuda else 'cpu')
        for k, v in my_state_a.items():
            if k in state_a:
                state_a[k] = v
        self.player.model.load_state_dict(state_a)

    # Set up the enemy player's model
    if model_path_b == 'random':
        print('load enemy model:', model_path_b)
        self.enemy = agents.RandomAgent(BOARD_SIZE)
    elif model_path_b == 'puct':
        print('load enemy model:', model_path_b)
        self.enemy = agents.PUCTAgent(BOARD_SIZE, N_MCTS_ENEMY)
    elif model_path_b == 'uct':
        print('load enemy model:', model_path_b)
        self.enemy = agents.UCTAgent(BOARD_SIZE, N_MCTS_ENEMY)
    elif model_path_b == 'human':
        print('load enemy model:', model_path_b)
        self.enemy = agents.HumanAgent(BOARD_SIZE, self.env)
    elif model_path_b == 'web':
        print('load enemy model:', model_path_b)
        self.enemy = agents.WebAgent(BOARD_SIZE)
    else:
        # This branch runs when a pre-trained checkpoint is used
        print('load enemy model:', model_path_b)
        # Set up the enemy agent
        self.enemy = agents.ZeroAgent(BOARD_SIZE, N_MCTS_ENEMY,
                                      IN_PLANES_ENEMY, noise=False)
        # Build the enemy network, move it to the device (GPU), and store it
        # in agents.ZeroAgent().model
        self.enemy.model = model.PVNet(N_BLOCKS_ENEMY, IN_PLANES_ENEMY,
                                       OUT_PLANES_ENEMY, BOARD_SIZE).to(device)
        # dict of the network's parameter tensors
        state_b = self.enemy.model.state_dict()
        # load the saved parameter file
        my_state_b = torch.load(model_path_b,
                                map_location='cuda:0' if use_cuda else 'cpu')
        # state_b maps layer keys (weight, bias, ...) to their value tensors
        for k, v in my_state_b.items():
            if k in state_b:
                state_b[k] = v
        # set the parameters on the deep learning model
        self.enemy.model.load_state_dict(state_b)

    # monitor agent, same as above
    self.monitor = agents.ZeroAgent(BOARD_SIZE, N_MCTS_MONITOR,
                                    IN_PLANES_ENEMY, noise=False)
    self.monitor.model = model.PVNet(N_BLOCKS_ENEMY, IN_PLANES_ENEMY,
                                     OUT_PLANES_ENEMY, BOARD_SIZE).to(device)
    state_b = self.monitor.model.state_dict()
    my_state_b = torch.load(model_path_m,
                            map_location='cuda:0' if use_cuda else 'cpu')
    for k, v in my_state_b.items():
        if k in state_b:
            state_b[k] = v
    self.monitor.model.load_state_dict(state_b)
def train(self, no_scenarios, print_log, plot_graphs, save_graphs,
          collect_comparison=True):
    # stats collects the per-scenario results and run metadata listed below
    stats = {
        'scenarios': [],
        'rewards': [],
        'durations': [],
        'detected': [],
        'missed': [],
        'ttf': [],
        'napfd': [],
        'recall': [],
        'avg_precision': [],
        'result': [],
        'step': [],
        'env': self.scenario_provider.name,
        'agent': self.agent.name,
        # 'action_size': self.agent.action_size,
        'history_length': self.agent.histlen,
        'rewardfun': self.reward_function.__name__,
        'sched_time': self.scenario_provider.avail_time_ratio,
        'hidden_size': 'x'.join(str(x) for x in self.agent.hidden_size)
                       if hasattr(self.agent, 'hidden_size') else 0
    }

    if collect_comparison:
        cmp_agents = {
            'heur_sort': agents.HeuristicSortAgent(self.agent.histlen),
            'heur_weight': agents.HeuristicWeightAgent(self.agent.histlen),
            'heur_random': agents.RandomAgent(self.agent.histlen)
        }
        stats['comparison'] = {}
        # stats['comparison'] is initialized per comparison agent
        # (heur_sort / heur_weight / heur_random)
        for key in cmp_agents.keys():
            stats['comparison'][key] = {
                'detected': [],
                'missed': [],
                'ttf': [],
                'napfd': [],
                'recall': [],
                'avg_precision': [],
                'durations': []
            }

    sum_actions = 0
    sum_scenarios = 0
    sum_detected = 0
    sum_missed = 0
    sum_reward = 0

    # enumerate yields (count, element) tuples
    # write_file.write("Agent is " + str(self.agent))
    for (i, sc) in enumerate(self.scenario_provider, start=1):
        if i > no_scenarios:
            break
        start = time.time()
        if print_log:
            print('ep %d:\tscenario %s\t' % (sum_scenarios + 1, sc.name), end='')
        (result, reward) = self.process_scenario(sc)
        end = time.time()

        # Statistics of the CI cycle after prioritization and selection of
        # test cases from the test suite
        sum_detected += result[0]
        sum_missed += result[1]
        # To weight test cases by priority in the future, np.average() with
        # priority-dependent weights could replace np.mean() here
        sum_reward += np.mean(reward)
        sum_actions += 1
        sum_scenarios += 1
        duration = end - start

        stats['scenarios'].append(sc.name)
        stats['rewards'].append(np.mean(reward))
        stats['durations'].append(duration)
        stats['detected'].append(result[0])
        stats['missed'].append(result[1])
        # TTF (time to failure) is the position our algorithm assigned to the
        # first failing test case
        stats['ttf'].append(result[2])
        stats['napfd'].append(result[3])
        stats['recall'].append(result[4])
        stats['avg_precision'].append(result[5])
        stats['result'].append(result)
        stats['step'].append(sum_scenarios)

        if print_log:
            print(' finished, reward: %.2f,\trunning mean: %.4f,\tduration: %.1f,\tresult: %s' %
                  (np.mean(reward), sum_reward / sum_scenarios, duration, result))

        global total_failures_detected
        global total_failures_missed
        total_failures_detected += result[0]
        total_failures_missed += result[1]

        # collect_comparison is True when args.comparable is set; the block
        # below gathers the results of heur_sort, heur_random and heur_weight.
        if collect_comparison:
            for key in stats['comparison'].keys():
                start = time.time()
                cmp_res = process_scenario(cmp_agents[key], sc, preprocess_discrete)
                end = time.time()
                stats['comparison'][key]['detected'].append(cmp_res[0])
                stats['comparison'][key]['missed'].append(cmp_res[1])
                stats['comparison'][key]['ttf'].append(cmp_res[2])
                stats['comparison'][key]['napfd'].append(cmp_res[3])
                stats['comparison'][key]['recall'].append(cmp_res[4])
                stats['comparison'][key]['avg_precision'].append(cmp_res[5])
                stats['comparison'][key]['durations'].append(end - start)

        # Data dumping
        # The two commented-out lines below dumped stats at a fixed interval;
        # this is unnecessary here because the full stats are written once at
        # the end of the run.
        # if self.dump_interval > 0 and sum_scenarios % self.dump_interval == 0:
        #     pickle.dump(stats, open(self.stats_file + '.p', 'wb'))

        if self.validation_interval > 0 and (
                sum_scenarios == 1 or sum_scenarios % self.validation_interval == 0):
            if print_log:
                print('ep %d:\tRun test... ' % sum_scenarios, end='')
            self.run_validation(sum_scenarios)
            pickle.dump(self.validation_res, open(self.val_file + '.p', 'wb'))
            if print_log:
                print('done')

    # Dump the stats of all CI cycles into the stats file
    if self.dump_interval > 0:
        self.agent.save(self.agent_file)
        pickle.dump(stats, open(self.stats_file + '.p', 'wb'))

    # Plot graphs
    if plot_graphs:
        plot_stats.plot_stats_single_figure(self.file_prefix,
                                            self.stats_file + '.p',
                                            self.val_file + '.p', 1,
                                            plot_graphs=plot_graphs,
                                            save_graphs=save_graphs)

    # Save the generated graphs
    if save_graphs:
        plot_stats.plot_stats_separate_figures(self.file_prefix,
                                               self.stats_file + '.p',
                                               self.val_file + '.p', 1,
                                               plot_graphs=False,
                                               save_graphs=save_graphs)

    return np.mean(stats['napfd']), np.mean(stats['recall'])
import os
import sys
import pickle  # needed for pickle.load below

from game import ConnectFour
import agents
from utilities import pickMove, pickAndMakeMove

game = ConnectFour()

if len(sys.argv) >= 2:
    print("Using opponent from {}".format(sys.argv[1]))
    with open(sys.argv[1], 'rb') as output:
        opponent = pickle.load(output)
else:
    print("Using random agent")
    opponent = agents.RandomAgent()

while not game.isFinished():
    if game.isOurTurn():
        print(game)
        possibles = game.possibleMoves()
        column = input("Which column {}? ".format(possibles))
        try:
            game.playMove(int(column))
        except Exception as e:
            print("error occurred", e)
    else:
        try:
            pickAndMakeMove(game, opponent)
        except Exception:
            print("AI chose invalid move, trying random")