import csv
import json

from env import Environment, Agent


def prepare_environment(config_address, data_address, models_address=None):
    """Loads config and models, creates agents and environment."""

    def read_header_from_csv(agent_name):
        with open(data_address + agent_name + "_x.csv", "r") as f:
            header = list(csv.reader(f))[0]
        return header

    with open(config_address, 'r') as sim_file:
        config = json.load(sim_file)

    input_names = {
        agent_name: read_header_from_csv(agent_name)
        for agent_name in config
    }
    constants = {
        agent_name: [name for name in input_names[agent_name]
                     if name.startswith("const_")]
        for agent_name in input_names
    }
    states = {
        agent_name: [name for name in input_names[agent_name]
                     if name.startswith("var_")]
        for agent_name in input_names
    }

    agents_dict = {
        name: Agent(constants[name], states[name], name,
                    config[name]["hyper_parameters"])
        for name in input_names
    }

    env = Environment()
    env.register_agents(*agents_dict.values())
    for name in agents_dict:
        to_agents = [agents_dict[to_name] for to_name in config[name]['to_agents']]
        env.register_connections(agents_dict[name], *to_agents)
    env.compile()

    if models_address is not None:
        env.load_models(models_address)

    return env, agents_dict
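# A minimal usage sketch for prepare_environment. The file names below are
# hypothetical; the layout is inferred from the function above: the config is
# a JSON dict keyed by agent name, each entry holding "hyper_parameters" and
# "to_agents", and each agent has a "<name>_x.csv" whose header columns are
# prefixed "const_" (constants) or "var_" (state variables).
#
#     env, agents = prepare_environment('config.json', 'data/', models_address='models/')
#     print(list(agents))  # agent names, as read from the config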
def test_replay(model_file, agent_type='qmix', period=None):
    import yaml
    from os.path import expanduser
    from utilities import get_args

    with open('default_config.yaml', 'r') as f:
        args = get_args(yaml.safe_load(f))  # safe_load: the plain loader is unsafe/deprecated
    path = expanduser('~') + args.path  # was a hard-coded '/home/koen'
    args.gamma = 0.8
    args.max_episode_length = 30
    args.step_penalty = 0.05
    args.a_terrain = True

    if agent_type == 'qmix':
        model = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        target = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        model.load_state_dict(torch.load(path + model_file))
        target.load_state_dict(torch.load(path + model_file))
        models = {"model": model, "target": target}
        team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]
    elif agent_type == 'reinforce':
        models = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        models.load_state_dict(torch.load(path + model_file))
        team_blue = [PGAgent(idx, "blue", args) for idx in range(args.n_friends)]

    for agent in team_blue:
        agent.set_model(models)

    team_red = [Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)]
    agents = team_blue + team_red
    env = RestrictedEnvironment(agents, args)

    # Replay episodes until a short one (< 6 steps) is found, then visualize it.
    while True:
        episode = generate_episode(env, args)
        print(len(episode))
        if len(episode) < 6:
            visualize(env, episode, period=period)
            break
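# Example invocation (a sketch; the model file name is hypothetical and must
# live under expanduser('~') + args.path, mirroring how the training code
# saves models):
#
#     test_replay('RUN_42_MODEL.torch', agent_type='qmix', period=0.5)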
def main():
    parser = argparse.ArgumentParser(description='RL agents for atari')
    subparsers = parser.add_subparsers(title="subcommands", dest="subcommand")

    train_parser = subparsers.add_parser("train", help="train an RL agent for atari games")
    train_parser.add_argument("--task-id", type=int, required=True,
                              help="0 = BeamRider, 1 = Breakout, 2 = Enduro, 3 = Pong, "
                                   "4 = Qbert, 5 = Seaquest, 6 = SpaceInvaders")
    train_parser.add_argument("--gpu", type=int, default=None,
                              help="ID of GPU to be used")
    train_parser.add_argument("--double-dqn", type=int, default=0,
                              help="double dqn - 0 = No, 1 = Yes")
    train_parser.add_argument("--dueling-dqn", type=int, default=0,
                              help="dueling dqn - 0 = No, 1 = Yes")
    args = parser.parse_args()

    if args.gpu is not None and torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        print("CUDA Device: %d" % torch.cuda.current_device())

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    double_dqn = (args.double_dqn == 1)
    dueling_dqn = (args.dueling_dqn == 1)

    # env = get_env(task, seed, task.env_id, double_dqn, dueling_dqn)
    env_id = "MyEnv1_varyenv"
    print("Training on %s, double_dqn %d, dueling_dqn %d" % (env_id, double_dqn, dueling_dqn))

    # Build a 100x100 grid with one large region (value 2) and three small regions (value 1).
    gridmap = np.zeros([100, 100])
    gridmap[40:60, 40:60] = 2
    gridmap[10:13, 20:23] = 1
    gridmap[80:83, 30:33] = 1
    gridmap[75:78, 90:93] = 1
    # A 3rd argument would tell what state this learner needs from the grid,
    # e.g. grid_state_fn=lambda x: x.get_grid_img()
    env = Grid(100, 100, gridmap)

    # agentlist = [DQNSeeker((0, 0, 0), (255, 255, 255)) for k in range(50)]
    agentlist = [Agent((0, 0, 0), (255, 255, 255)) for k in range(25)]
    env.init_agents(agentlist)

    agent_learn(env, env_id, num_timesteps=1000000,
                double_dqn=double_dqn, dueling_dqn=dueling_dqn, varyenv=True)
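# Example CLI invocation (a sketch; the script name is hypothetical, the flags
# mirror the argparse definition above):
#
#     python main.py train --task-id 3 --gpu 0 --double-dqn 1 --dueling-dqn 0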
def train_iteratively(args, agent_type):
    """Train agents iteratively by exchanging agent networks.

    agent_type = 'reinforce' | 'qmix'
    """
    # setup the agents & environment
    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies + args.n_enemies

    # setup model
    if agent_type == 'reinforce':
        model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        team_blue = [PGAgent(idx, "blue", args) for idx in range(args.n_friends)]
    elif agent_type == 'qmix':
        model_ = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        target = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        model = {"model": model_, "target": target}
        team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]

    team_red = [Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red
    env = RestrictedEnvironment(agents, args)

    for agent in training_agents:
        agent.set_model(model)

    # first model
    if agent_type == 'reinforce':
        training_agents = train_agents_reinforce(env, training_agents, args)
    elif agent_type == 'qmix':
        training_agents = train_agents_qmix(env, training_agents, model, args)
    trained_model = copy.deepcopy(training_agents[0].model)

    for iteration in range(args.n_iterations):
        print(f'Iteration {iteration + 1}')
        # args.n_steps = 10000 * (iteration + 2)  # adapt step size
        # TODO: find optimal criterion, e.g. stop at certain win rate

        # upgrade team red with a frozen copy of the last trained model
        if agent_type == 'reinforce':
            team_red = [PGAgent(args.n_friends + idx, "red", args)
                        for idx in range(args.n_enemies)]
        elif agent_type == 'qmix':
            team_red = [QMIXAgent(args.n_friends + idx, "red", args)
                        for idx in range(args.n_enemies)]

        training_agents = team_blue
        agents = team_blue + team_red
        env = RestrictedEnvironment(agents, args)

        if agent_type == 'reinforce':
            opponent_model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
            opponent_model.load_state_dict(trained_model.state_dict())
            opponent_model.eval()
        elif agent_type == 'qmix':
            model_ = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
            model_.load_state_dict(trained_model.state_dict())
            target = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
            opponent_model = {"model": model_, "target": target}
        for agent in team_red:
            agent.set_model(opponent_model)

        if args.reset_model:
            if agent_type == 'reinforce':
                model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
            elif agent_type == 'qmix':
                model_ = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
                target = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
                model = {"model": model_, "target": target}
            for agent in team_blue:
                agent.set_model(model)

        if agent_type == 'reinforce':
            training_agents = train_agents_reinforce(env, training_agents, args)
        elif agent_type == 'qmix':
            training_agents = train_agents_qmix(env, training_agents, model, args)
        trained_model = copy.deepcopy(training_agents[0].model)

    from os.path import expanduser
    home = expanduser("~")
    torch.save(trained_model.state_dict(),
               home + args.path + f'RUN_{get_run_id()}_MODEL.torch')
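# Sketch of a driver for train_iteratively. `args` is assumed to come from the
# project's settings module (as in the __main__ block further below), with at
# least n_friends, n_enemies, n_iterations, reset_model and path set:
#
#     from settings import args
#     train_iteratively(args, agent_type='qmix')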
def train(args):
    team_blue = [IQLAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)
    else:
        raise ValueError(f"unknown env_type: {args.env_type}")

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies  # see process function in models.py

    models = generate_models(args.n_inputs, args.n_actions)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)

    epi_len, nwins = 0, 0
    ex.log_scalar('win', 0.0, step=0)  # forces start of run at 0 wins
    for step_idx in range(args.n_steps):
        episode = generate_episode(env)
        buffer.insert_list(episode)
        if not buffer.can_sample(args.batch_size):
            continue

        epi_len += len(episode)
        reward = episode[-1].rewards["blue"]
        if episode[-1].rewards["blue"] == 1:
            nwins += 1

        batch = buffer.sample(args.batch_size)
        for agent in training_agents:
            loss = agent.update(batch)
            if step_idx > 0 and step_idx % args.sync_interval == 0:
                agent.sync_models()  # TODO: same models get synced for all agents => to correct
            ex.log_scalar(f'loss{agent.id}', loss, step=step_idx)
            ex.log_scalar('epsilon', agent.scheduler(), step=step_idx)

        if step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            s = f"Step {step_idx}: loss: {loss:8.4f} - "
            s += f"Average length: {epi_len / PRINT_INTERVAL:5.2f} - "
            s += f"win ratio: {nwins / PRINT_INTERVAL:4.3f} - "
            s += f"epsilon: {agent.scheduler():4.3f} - "
            print(s)
            epi_len, nwins = 0, 0
            # _ = generate_episode(env, render=True)

        ex.log_scalar('length', len(episode), step=step_idx + 1)
        ex.log_scalar('win', int(episode[-1].rewards["blue"] == 1), step=step_idx + 1)
        ex.log_scalar('reward', reward, step=step_idx + 1)

    from os.path import expanduser
    home = expanduser("~")
    # for agent in training_agents:
    #     agent.save(home + args.path + f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(models["model"].state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
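# A minimal sketch of the buffer contract the loop above relies on
# (insert_list / can_sample / sample). The project's actual ReplayBuffer may
# differ, e.g. in eviction policy; SimpleReplayBuffer is a hypothetical name.
import random
from collections import deque


class SimpleReplayBuffer:
    def __init__(self, size):
        # bounded FIFO: oldest transitions are dropped once `size` is reached
        self.content = deque(maxlen=size)

    def insert_list(self, items):
        self.content.extend(items)

    def can_sample(self, batch_size):
        return len(self.content) >= batch_size

    def sample(self, batch_size):
        # uniform sampling without replacement within one batch
        return random.sample(list(self.content), batch_size)

    def __len__(self):
        return len(self.content)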
import numpy as np
import pandas as pd

from aggregator import DummyAggregator
from env import Environment, Agent

if __name__ == '__main__':
    env = Environment()
    agent1 = Agent(["c1", "c2"], ['p1', 'p2'], 'agent1')
    agent2 = Agent(["c1", "c2"], ['p1', 'p2', 'p3'], 'agent2')

    data = {
        agent1: {
            "states": pd.DataFrame({
                'p1': np.random.rand(100),
                'p2': np.random.rand(100)
            }),
            "constants": pd.DataFrame({
                "c1": np.random.rand(100),
                "c2": np.random.rand(100)
            })
        },
        agent2: {
            "states": pd.DataFrame({
                'p1': np.random.rand(100),
                'p2': np.random.rand(100),
                'p3': np.random.rand(100)
            }),
            "constants": pd.DataFrame({
                "c1": np.random.rand(100),
                "c2": np.random.rand(100)
            })
        }
    }
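    # The block above only builds the input frames; wiring and training would
    # follow the same Environment API exercised in scenario_two below (a
    # sketch, not part of the original script):
    #
    #     env.register_agents(agent1, agent2)
    #     env.register_connections(agent1, agent2)
    #     env.compile()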
def scenario_two():
    """
    In this scenario we have three agents: a1, a2, and a3.
    The relationship diagram looks like this (self-loops are assumed
    for all agents and are not represented here):
        a1 => {a2, a3}
        a2 => {a1}
        a3 => {a2}

    C(a1) = {l1, l2}    S(a1) = {x1, x2, x3}
    C(a2) = {m1}        S(a2) = {y1, y2, y3, y4}
    C(a3) = {}          S(a3) = {z1, z2}

    Dynamics of a1:
        x1 = (y1 * y2) / l1 + e^(-3 * x1)
        x2 = l2 * ln(|2 * y4|)
        x3 = x1 - x2
    Dynamics of a2:
        y1 = y4 / 4 + cos(z2)
        y2 = m1 * y1 + y2
        y3 = cos(z1 * y2 + z2 * y1)
        y4 = 1 / (m1 + x3) - floor(x1)
    Dynamics of a3:
        z1 = sqrt(3 * |z2|)
        z2 = e^(-(z1 - 3)^2)
    """
    env = Environment()
    a1 = Agent(['l1', 'l2'], ['x1', 'x2', 'x3'], 'agent1')
    a2 = Agent(['m1'], ['y1', 'y2', 'y3', 'y4'], 'agent2')
    a3 = Agent([], ['z1', 'z2'], 'agent3')

    env.register_agents(a1, a2, a3)
    env.register_connections(a1, a2, a3)
    env.register_connections(a2, a1)
    env.register_connections(a3, a2)
    env.compile()

    a1_constants = {'l1': np.random.rand(1000), 'l2': np.random.rand(1000)}
    a1_states = {
        'x1': np.random.rand(1000),
        'x2': np.random.rand(1000),
        'x3': np.random.rand(1000)
    }
    a2_constants = {'m1': np.random.rand(1000)}
    a2_states = {
        'y1': np.random.rand(1000),
        'y2': np.random.rand(1000),
        'y3': np.random.rand(1000),
        'y4': np.random.rand(1000)
    }
    a3_states = {'z1': np.random.rand(1000), 'z2': np.random.rand(1000)}

    data_input = {
        a1: {'constants': pd.DataFrame(a1_constants), 'states': pd.DataFrame(a1_states)},
        a2: {'constants': pd.DataFrame(a2_constants), 'states': pd.DataFrame(a2_states)},
        a3: {'constants': pd.DataFrame(), 'states': pd.DataFrame(a3_states)}
    }

    a1_outputs = {
        # x1 = (y1 * y2) / l1 + e^(-3 * x1)
        'x1': a2_states['y1'] * a2_states['y2'] / a1_constants['l1']
              + np.exp(-3 * a1_states['x1']),
        # x2 = l2 * ln(|2 * y4|)
        'x2': a1_constants['l2'] * np.log(np.abs(2 * a2_states['y4'])),
        # x3 = x1 - x2
        'x3': a1_states['x1'] - a1_states['x2']
    }
    a2_outputs = {
        # y1 = y4 / 4 + cos(z2)
        'y1': a2_states['y4'] / 4 + np.cos(a3_states['z2']),
        # y2 = m1 * y1 + y2
        'y2': a2_constants['m1'] * a2_states['y1'] + a2_states['y2'],
        # y3 = cos(z1 * y2 + z2 * y1)
        'y3': np.cos(a3_states['z1'] * a2_states['y2'] + a3_states['z2'] * a2_states['y1']),
        # y4 = 1 / (m1 + x3) - floor(x1)
        'y4': 1 / (a2_constants['m1'] + a1_states['x3']) - np.floor(a1_states['x1'])
    }
    a3_outputs = {
        # z1 = sqrt(3 * |z2|)
        'z1': np.sqrt(3 * np.abs(a3_states['z2'])),
        # z2 = e^(-(z1 - 3)^2)
        'z2': np.exp(-np.square(a3_states['z1'] - 3))
    }

    data_output = {
        a1: pd.DataFrame(a1_outputs),
        a2: pd.DataFrame(a2_outputs),
        a3: pd.DataFrame(a3_outputs)
    }

    env.solo_train(data_input, data_output,
                   training_hyper_params={a1: {'epochs': 100},
                                          a2: {'epochs': 100},
                                          a3: {'epochs': 50}})
    env.solo_test(data_input, data_output)
    print(env.correlation_matrix(a1))
        # ... (earlier part of the training loop not shown)
        ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))
        if episode[-1].rewards["blue"] == 1:
            nwins += 1

        for agent in training_agents:
            loss = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', loss)

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len / args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins / args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

    return training_agents


if __name__ == '__main__':
    from env import Environment, Agent
    from settings import args

    team_blue = [Agent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red
    env = Environment(agents)

    episode = generate_episode(env, render=True)
def train(args):
    team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]
    team_red = [Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)
    else:
        raise ValueError(f"unknown env_type: {args.env_type}")

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = (4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies
                     + args.n_enemies)  # see process function in models.py

    models = generate_models(args.n_inputs, args.n_actions, args)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)
    mac = MultiAgentController(env, training_agents, models, args)

    for step_idx in range(args.n_steps):
        episode = generate_episode(env, args)
        buffer.insert_list(episode)
        if len(buffer) < args.batch_size:
            continue
        batch = buffer.sample(args.batch_size)
        loss = mac.update(batch)

        if step_idx % args.sync_interval == 0:
            mac.sync_networks()

        ## logging
        ex.log_scalar('loss', loss)
        if step_idx % args.log_interval == 0:
            episode = generate_episode(env, args, test_mode=True)
            if step_idx == 0:  # force the run to start from a loss for blue
                episode[-1].rewards["blue"] = 0
                episode[-1].rewards["red"] = 1
            ex.log_scalar('length', len(episode), step=step_idx)
            ex.log_scalar('reward', episode[-1].rewards["blue"], step=step_idx)
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1), step=step_idx)
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1), step=step_idx)
            ex.log_scalar('epsilon', training_agents[0].scheduler(), step=step_idx)

        if PRINT and step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            print(f"Step {step_idx}: loss = {loss}, reward = {episode[-1].rewards['blue']}")
            # episode = generate_episode(env, render=True)

        if args.save_model and step_idx > 0 and step_idx % args.save_model_interval == 0:
            from os.path import expanduser
            home = expanduser("~")
            torch.save(models["model"].state_dict(),
                       home + args.path + f'RUN_{get_run_id()}_MODEL.torch')
            if args.use_mixer:
                torch.save(mac.mixer.state_dict(),
                           home + args.path + f'RUN_{get_run_id()}_MIXER.torch')
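# Restoring a saved run (a sketch mirroring the torch.save calls above; the
# run id is hypothetical):
#
#     model = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
#     model.load_state_dict(torch.load(home + args.path + 'RUN_42_MODEL.torch'))
#     if args.use_mixer:
#         mixer_state = torch.load(home + args.path + 'RUN_42_MIXER.torch')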