def policy_iteration(env, policy, epsilon):
    """On-policy Monte Carlo control with an epsilon-greedy policy."""
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon)
    return q
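# The two helpers used above are not shown here. The sketches below illustrate
# what Monte Carlo evaluation with incremental averaging and epsilon-greedy
# improvement typically look like. The data layout (q[state][action],
# visits_map[state][action], policy[state] as an action-probability dict) and
# the episode format of (state, action, reward) triples are assumptions for
# illustration, not taken from this repository.
def on_policy_evaluation_sketch(episode, q, visits_map):
    """Every-visit MC: fold the observed return into q via an incremental mean."""
    g = 0.0
    for state, action, reward in reversed(episode):
        g += reward                                  # undiscounted return (gamma = 1)
        visits_map[state][action] += 1
        n = visits_map[state][action]
        q[state][action] += (g - q[state][action]) / n

def epsilon_greedy_policy_improvement_sketch(env, episode, q, policy, epsilon):
    """Make the policy epsilon-greedy with respect to the current q estimates."""
    for state, _, _ in episode:
        n_actions = len(q[state])
        best = max(q[state], key=q[state].get)
        for a in q[state]:
            policy[state][a] = epsilon / n_actions
        policy[state][best] += 1 - epsilon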
def policy_iteration(env, target_policy, behavior_policy):
    """Off-policy Monte Carlo control: evaluate and improve target_policy using
    episodes generated by behavior_policy."""
    q = init_state_action_map(env)
    c = init_state_action_map(env)  # cumulative weights used by off_policy_evaluation
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
        greedy_stochastic_policy_improvement(env, episode, q, target_policy)
    return q
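# off_policy_evaluation itself is not shown here. The (q, c) pair suggests the
# standard weighted-importance-sampling update in the style of Sutton & Barto,
# sketched below. The data layout and episode format are assumptions for
# illustration, not taken from this repository.
def off_policy_evaluation_sketch(episode, q, c, target_policy, behavior_policy):
    """Update q toward the target policy's returns from behavior-policy episodes."""
    g = 0.0   # return
    w = 1.0   # importance-sampling ratio
    for state, action, reward in reversed(episode):
        g += reward                                   # undiscounted (gamma = 1)
        c[state][action] += w
        q[state][action] += (w / c[state][action]) * (g - q[state][action])
        w *= target_policy[state][action] / behavior_policy[state][action]
        if w == 0:
            break                                     # all remaining weights are zero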
def main():
    env = Blackjack()
    policy = init_policy(env)
    v = init_state_map(env)
    visits_map = init_state_map(env)
    for _ in range(20000):
        episode = generate_episode(env, policy)
        on_policy_state_evaluation(episode, v, visits_map)
    env.visualize_state_value(v)
def main():
    env = Blackjack()
    target_policy = init_policy(env)
    behavior_policy = init_equiprobable_random_policy(env)
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
    env.visualize_action_value(q)
def play_from_file(filename):
    """Load a trained model from `filename` and play one rendered episode.
    Note: relies on a module-level `args` (e.g. an experiment config) being in scope."""
    model = ForwardModel(input_shape=13, n_actions=7)
    model.load_state_dict(torch.load(filename))
    team_red = [PGAgent(2, "red", model), PGAgent(3, "red", model)]
    team_blue = [Agent(0, "blue"), Agent(1, "blue")]
    agents = team_blue + team_red
    env = Environment(agents)
    _ = generate_episode(env, args, render=True)
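# Hypothetical usage; the checkpoint name below is a placeholder, and the call
# assumes the module-level `args` mentioned above is already configured.
# play_from_file('RUN_1234.torch')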
def policy_iteration2(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        fine_grained_off_policy_iteration(episode, q, c, target_policy,
                                          behavior_policy, gamma=1)
    return q
def test_transferability(args, filename):
    team_blue = [Agent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        PGAgent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies

    model = ForwardModel(input_shape=args.n_inputs, n_actions=args.n_actions)
    model.load_state_dict(torch.load(args.path + filename))
    model.eval()
    for agent in team_red:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    for step_idx in range(40):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]
            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', reward)
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))
            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0
def train(args):
    team_blue = [IQLAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)
    ]
    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies  # see process function in models.py

    models = generate_models(args.n_inputs, args.n_actions)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)

    epi_len, nwins = 0, 0
    ex.log_scalar('win', 0.0, step=0)  # forces start of run at 0 wins
    for step_idx in range(args.n_steps):
        episode = generate_episode(env)
        buffer.insert_list(episode)
        if not buffer.can_sample(args.batch_size):
            continue

        epi_len += len(episode)
        reward = episode[-1].rewards["blue"]
        if episode[-1].rewards["blue"] == 1:
            nwins += 1

        batch = buffer.sample(args.batch_size)
        for agent in training_agents:
            loss = agent.update(batch)
            if step_idx > 0 and step_idx % args.sync_interval == 0:
                agent.sync_models()  # TODO: same models get synced for all agents => to correct
            ex.log_scalar(f'loss{agent.id}', loss, step=step_idx)
            ex.log_scalar('epsilon', agent.scheduler(), step=step_idx)

        if step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            s = f"Step {step_idx}: loss: {loss:8.4f} - "
            s += f"Average length: {epi_len/PRINT_INTERVAL:5.2f} - "
            s += f"win ratio: {nwins/PRINT_INTERVAL:4.3f} - "
            s += f"epsilon: {agent.scheduler():4.3f} - "
            print(s)
            epi_len, nwins = 0, 0
            # _ = generate_episode(env, render=True)

        ex.log_scalar('length', len(episode), step=step_idx + 1)
        ex.log_scalar('win', int(episode[-1].rewards["blue"] == 1), step=step_idx + 1)
        ex.log_scalar('reward', reward, step=step_idx + 1)

    from os.path import expanduser
    home = expanduser("~")
    # for agent in training_agents:
    #     agent.save(home + args.path + f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(models["model"].state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
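# Hypothetical example of the argument object train() expects. Only the attribute
# names are taken from the code above; every value here (team sizes, buffer size,
# sync interval, save path) is an illustrative assumption.
from types import SimpleNamespace

example_args = SimpleNamespace(
    n_friends=2,          # number of learning IQL (blue) agents
    n_enemies=2,          # number of scripted (red) opponents
    env_type='normal',    # 'normal' -> Environment, 'restricted' -> Environment2
    n_steps=100_000,      # number of training iterations
    buffer_size=5_000,    # replay buffer capacity
    batch_size=512,       # minibatch size sampled from the buffer
    sync_interval=90,     # steps between target-network syncs
    path='/models/',      # save directory, appended to the user's home
)
# train(example_args)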
def train(args):
    team_blue = [PGAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]
    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies + args.n_enemies

    # setup model
    if args.model == 'FORWARD':
        model = ForwardModel(input_shape=args.n_inputs, n_actions=args.n_actions)
    elif args.model == 'RNN':
        model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
    for agent in training_agents:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0, step=n_episodes + 1)  # forces start of run at 0 wins
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]
            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar('win', int(episode[-1].rewards["blue"] == 1), step=n_episodes + 1)
            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', stats["loss"], step=n_episodes)
            ex.log_scalar(f'grads{agent.id}', stats["grads_l2"], step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}', stats["grads_var"], step=n_episodes)

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0
        # _ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    # for agent in training_agents:
    #     agent.save(home + args.path + f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(model.state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
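# PGAgent.update is not shown here. The self-contained sketch below shows the
# kind of REINFORCE step it presumably performs on a batch of transitions. The
# (observation, action, return) tensors, the absence of a baseline, and the use
# of a plain optimizer step are assumptions, not taken from this repository.
import torch
import torch.nn.functional as F

def reinforce_step(model, optimizer, observations, actions, returns):
    """One policy-gradient step: minimize -E[log pi(a_t | s_t) * G_t]."""
    logits = model(observations)                        # (batch, n_actions)
    log_probs = F.log_softmax(logits, dim=-1)
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = -(chosen * returns).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return {"loss": loss.item()}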
def train():
    # Note: relies on a module-level `args` (e.g. an experiment config) being in scope.
    team_blue = [IACAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]
    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies

    # setup model
    models = generate_model(input_shape=args.n_inputs, n_actions=args.n_actions)
    for agent in training_agents:
        agent.set_models(models)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0, step=0)  # forces start of run at 0 wins
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]
            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar('win', int(episode[-1].rewards["blue"] == 1), step=n_episodes + 1)
            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'policy_loss{agent.id}', stats['policy_loss'], step=n_episodes)
            ex.log_scalar(f'value_loss{agent.id}', stats['value_loss'], step=n_episodes)
            ex.log_scalar(f'loss{agent.id}', stats['loss'], step=n_episodes)
            ex.log_scalar(f'entropy{agent.id}', stats['entropy'], step=n_episodes)
            ex.log_scalar(f'grads{agent.id}', stats["grads_l2"], step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}', stats["grads_var"], step=n_episodes)
            if step_idx % 50 == 0:  # args.sync_interval
                agent.sync_models()
                print(f'sync at {step_idx * args.n_episodes_per_step}')

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0
        # _ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    # for agent in training_agents:
    #     agent.save(home + args.path + f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(models["model"].state_dict(),
               home + args.path + f'RUN_{get_run_id()}_MODEL.torch')
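# IACAgent.update reports a policy loss, a value loss, an entropy term and their
# combination. The sketch below shows one common way those terms fit together
# (advantage actor-critic). The loss coefficients and the advantage estimate are
# assumptions for illustration, not values taken from this repository.
import torch
import torch.nn.functional as F

def actor_critic_loss(logits, values, actions, returns,
                      value_coef=0.5, entropy_coef=0.01):
    """Combine policy-gradient, value-regression and entropy-bonus terms."""
    log_probs = F.log_softmax(logits, dim=-1)
    probs = log_probs.exp()
    advantages = returns - values.detach()                        # A(s, a) estimate
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
    policy_loss = -(chosen * advantages).mean()
    value_loss = F.mse_loss(values, returns)
    entropy = -(probs * log_probs).sum(dim=-1).mean()
    loss = policy_loss + value_coef * value_loss - entropy_coef * entropy
    return {"policy_loss": policy_loss.item(), "value_loss": value_loss.item(),
            "entropy": entropy.item(), "loss": loss}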