def execute(args, params, device):
    """Train two independent DQN agents that alternate control of the env.

    Each player owns a full component stack (net, target net, buffer,
    optimizer, reward tracker).  Control alternates between the players in
    chunks of ``args.units`` frames.  Training stops when either player's
    ``main.train`` call signals completion (stop reward reached) or when
    player 1 exceeds the optional ``args.maxFrames`` budget.
    """
    utils.kill_game_processes()
    env = main.make_env(args, params)

    # Independent component stacks for player 0 and player 1.
    (name_a, writer_a, net_a, tgt_a, sel_a, eps_trk_a,
     _agent_a, src_a, buf_a, opt_a) = main.make_components(args, params, device, env, 0)
    (name_b, writer_b, net_b, tgt_b, sel_b, eps_trk_b,
     _agent_b, src_b, buf_b, opt_b) = main.make_components(args, params, device, env, 1)

    global_step = 0
    steps_a = 0
    steps_b = 0
    eval_a = None
    eval_b = None
    stamp = datetime.now().strftime("%b%d_%H-%M-%S")

    with common.RewardTracker(writer_a, params['stop_reward_player1'], net_a,
                              stamp + name_a + ".dat", 0, env) as tracker_a, \
         common.RewardTracker(writer_b, params['stop_reward_player2'], net_b,
                              stamp + name_b + ".dat", 1, env) as tracker_b:
        # Prime both players' experience histories before alternating.
        main.train(args, params, device, buf_a, eps_trk_a, steps_a, src_a,
                   tracker_a, sel_a, opt_a, net_a, tgt_a, writer_a, eval_a)
        main.train(args, params, device, buf_b, eps_trk_b, steps_b, src_b,
                   tracker_b, sel_b, opt_b, net_b, tgt_b, writer_b, eval_b)
        while True:
            # Whose turn: player 0 trains on even ``args.units`` chunks.
            if (global_step // args.units) % 2 == 0:
                steps_a += 1
                finished = main.train(args, params, device, buf_a, eps_trk_a,
                                      steps_a, src_a, tracker_a, sel_a, opt_a,
                                      net_a, tgt_a, writer_a, eval_a)
            else:
                steps_b += 1
                finished = main.train(args, params, device, buf_b, eps_trk_b,
                                      steps_b, src_b, tracker_b, sel_b, opt_b,
                                      net_b, tgt_b, writer_b, eval_b)
            if finished:
                break
            global_step += 1
            # Optional hard cap, measured on player 0's training steps.
            if args.maxFrames > 0 and steps_a > args.maxFrames:
                break
def execute(args, params, device):
    """Train player 0 against a frozen, periodically refreshed copy of itself.

    Player 1 acts greedily (argmax) using a ``TargetNet`` snapshot of player
    0's network; the snapshot is re-synced every ``NET_SYNC`` frames so the
    opponent lags the learner by a fixed interval.  Training stops when
    ``main.train`` signals the stop reward or ``args.maxFrames`` is exceeded.
    """
    utils.kill_game_processes()
    env = main.make_env(args, params)

    (result_name1, writer1, net1, tgt_net1, _agent1, exp_source1,
     buffer1, optimizer1) = main.make_components(args, params, device, env, 0)

    # Frozen opponent snapshot, refreshed every NET_SYNC frames.
    net2 = ptan.agent.TargetNet(net1)
    # BUGFIX: the opponent must query the frozen snapshot (net2.target_model),
    # not the live learner net1 — otherwise net2 / NET_SYNC syncing is dead
    # code and the "frozen opponent" is actually the current network.
    agent2 = ptan.agent.DQNAgent(lambda x: net2.target_model.qvals(x),
                                 ptan.actions.ArgmaxActionSelector(),
                                 device=device)

    frame = 0
    frame_idx1 = 0
    date_time = datetime.now().strftime("%b%d_%H-%M-%S")

    with common.RewardTracker(writer1, params['stop_reward_player1'], net1,
                              date_time + result_name1 + ".dat", 0, env) as reward_tracker1:
        # Prime the experience history before the main loop.
        main.train(params, buffer1, device, frame_idx1, exp_source1,
                   reward_tracker1, optimizer1, net1, tgt_net1, writer1)
        while True:
            if (frame // args.units) % 2 == 0:
                # Opponent's turn: no-op step to observe, then act greedily.
                state, _, _, _ = env.step((1, -1))
                action, _ = agent2([state])
                _, _reward, done, _ = env.step((1, action[0]))
                if done:
                    env.reset()
            else:
                # Learner's turn.
                frame_idx1 += 1
                if main.train(params, buffer1, device, frame_idx1, exp_source1,
                              reward_tracker1, optimizer1, net1, tgt_net1, writer1):
                    break
            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
            frame += 1
            # Refresh the frozen opponent from the learner.
            if frame % NET_SYNC == 0:
                net2.sync()
def execute(args, params, device):
    """Evaluate a pre-trained Rainbow DQN as player 0 with epsilon-greedy noise.

    Loads the weights from ``args.model1``, plays greedily (with a small
    random-action probability), logs per-episode reward via RewardTracker,
    and reloads the checkpoint at every episode end.  Runs until the
    optional ``args.maxFrames`` step budget is exhausted.
    """
    utils.kill_game_processes()
    env = main.make_env(args, params)

    net1 = dqn_model.RainbowDQN(env.observation_space.shape, env.action_space.n)
    net1.load_state_dict(
        torch.load(args.model1, map_location=lambda storage, loc: storage))
    agent1 = ptan.agent.DQNAgent(lambda x: net1.qvals(x),
                                 ptan.actions.ArgmaxActionSelector(),
                                 device=torch.device("cpu"))

    result_name = "-" + "-rainbow" + "-scenario=" + args.scenario + "-units=" + str(
        args.units)
    writer1 = SummaryWriter(comment=result_name + "-player0")
    env.reset()

    episode_reward = 0.0
    action_counts = collections.Counter()
    epsilon = 0.02  # small exploration noise during evaluation
    step = 0

    with common.RewardTracker(writer1, 100, net1, "x.dat", 0, env) as reward_tracker1:
        while True:
            step += 1
            if np.random.random() < epsilon:
                # Occasional random action.
                action = [env.action_space.sample()]
            else:
                # No-op step to fetch the current observation, then act greedily.
                state, _, _, _ = env.step((0, -1))
                action, _ = agent1([state], [None])
            action_counts[action[0]] += 1
            _, reward, done, _ = env.step((0, action[0]))
            episode_reward += reward
            if done:
                reward_tracker1.reward(episode_reward, step)
                episode_reward = 0.0
                env.reset()
                # Restore the checkpoint for the next episode.
                net1.load_state_dict(
                    torch.load(args.model1, map_location=lambda storage, loc: storage))
            if args.maxFrames > 0 and step > args.maxFrames:
                break
def execute(args, params, device):
    """Single-agent training loop for player 0.

    Builds one component stack, then repeatedly calls ``main.train`` until
    it signals the stop reward was reached or the optional ``args.maxFrames``
    budget runs out.
    """
    utils.kill_game_processes()
    env = main.make_env(args, params)

    (result_name, writer, net, tgt_net, _agent, exp_source,
     buffer, optimizer) = main.make_components(args, params, device, env, 0)

    step = 0
    stamp = datetime.now().strftime("%b%d_%H-%M-%S")

    with common.RewardTracker(writer, params['stop_reward_player1'], net,
                              stamp + result_name + ".dat", 0, env) as reward_tracker:
        while True:
            step += 1
            if main.train(params, buffer, device, step, exp_source,
                          reward_tracker, optimizer, net, tgt_net, writer):
                break
            if args.maxFrames > 0 and step > args.maxFrames:
                break