def __init__(self, id, num_inputs, action_dim, hidden_size, gamma, critic_lr, actor_lr, tau, alpha, target_update_interval, savetag, foldername, actualize, use_gpu):

    self.num_inputs = num_inputs
    self.action_space = action_dim
    self.gamma = gamma
    self.tau = tau                                          # Polyak coefficient (was hardcoded to 0.005, ignoring the argument)
    self.alpha = alpha                                      # Entropy temperature (was hardcoded to 0.2, ignoring the argument)
    self.policy_type = "Gaussian"
    self.target_update_interval = target_update_interval    # Was hardcoded to 1, ignoring the argument
    self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'value_'+savetag, 'value_loss_'+savetag, 'policy_loss_'+savetag, 'mean_loss_'+savetag, 'std_loss_'+savetag], '.csv', save_iteration=1000, conv_size=1000)
    self.total_update = 0
    self.agent_id = id
    self.actualize = actualize

    # Soft Q-function and its optimizer
    self.critic = QNetwork(self.num_inputs, self.action_space, hidden_size)
    self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)
    self.soft_q_criterion = nn.MSELoss()

    if self.policy_type == "Gaussian":
        self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='GaussianPolicy')
        self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

        # Separate state-value network with a slowly updated target copy
        self.value = ValueNetwork(self.num_inputs, hidden_size)
        self.value_target = ValueNetwork(self.num_inputs, hidden_size)
        self.value_optim = Adam(self.value.parameters(), lr=critic_lr)
        utils.hard_update(self.value_target, self.value)
        self.value_criterion = nn.MSELoss()
    else:
        self.policy = Actor(self.num_inputs, self.action_space, hidden_size, policy_type='DeterministicPolicy')
        self.policy_optim = Adam(self.policy.parameters(), lr=actor_lr)

        self.critic_target = QNetwork(self.num_inputs, self.action_space, hidden_size)
        utils.hard_update(self.critic_target, self.critic)

    # Move networks to GPU only when requested; the value networks exist only in the
    # Gaussian branch (the original called .cuda() unconditionally and would crash
    # on the missing self.value in the deterministic branch)
    if use_gpu:
        self.policy.cuda()
        self.critic.cuda()
        if self.policy_type == "Gaussian":
            self.value.cuda()
            self.value_target.cuda()
        else:
            self.critic_target.cuda()

    # Statistics Tracker
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.val = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.value_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.mean_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.std_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
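# The initializer above leans on utils.hard_update to clone a freshly built
# network into its target, and tau implies a Polyak (soft) update during
# training. Neither helper appears in this excerpt; below is a minimal sketch
# of what they conventionally look like in ERL-style codebases, using the
# standard PyTorch parameter-copy idiom (bodies inferred from the call sites,
# not confirmed by this source):
def hard_update(target, source):
    """Copy every parameter of source into target (target <- source)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak average: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)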
def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w=True):

    self.algo_name = algo_name
    self.gamma = gamma
    self.tau = tau
    self.total_update = 0
    self.agent_id = id
    self.use_gpu = use_gpu
    self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag], '.csv', save_iteration=1000, conv_size=1000)
    self.num_agents = num_agents

    # Initialize actors: one multi-head policy shared by all agents, plus its target copy
    self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    if init_w:
        self.policy.apply(utils.init_weights)
    self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    utils.hard_update(self.policy_target, self.policy)
    self.policy_optim = Adam(self.policy.parameters(), actor_lr)

    # One centralized critic (and target) per agent, each over the joint state-action space
    self.critics = [QNetwork(state_dim * num_agents, action_dim * num_agents, hidden_size * 3) for _ in range(num_agents)]
    self.critics_target = [QNetwork(state_dim * num_agents, action_dim * num_agents, hidden_size * 3) for _ in range(num_agents)]
    for critic, critic_target in zip(self.critics, self.critics_target):
        if init_w:
            critic.apply(utils.init_weights)
        # Targets must start synchronized with their critics regardless of init_w
        # (the original only synced them inside the init_w branch)
        utils.hard_update(critic_target, critic)
    self.critic_optims = [Adam(critic.parameters(), critic_lr) for critic in self.critics]

    self.loss = nn.MSELoss()

    if use_gpu:
        self.policy_target.cuda()
        self.policy.cuda()
        for critic, critic_target in zip(self.critics, self.critics_target):
            critic.cuda()
            critic_target.cuda()

    self.num_critic_updates = 0

    # Statistics Tracker
    # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
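# MultiHeadActor is constructed above with (state_dim, action_dim, hidden_size,
# num_agents), but its definition is not part of this excerpt. A minimal sketch
# of a multi-head actor consistent with that signature: a shared trunk with one
# action head per agent. The single-layer ReLU trunk and tanh-bounded output
# are assumptions for illustration, not the repo's exact architecture:
import torch
import torch.nn as nn

class MultiHeadActor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size, num_agents):
        super().__init__()
        # Feature trunk shared by all agents
        self.trunk = nn.Sequential(nn.Linear(state_dim, hidden_size), nn.ReLU())
        # One linear action head per agent
        self.heads = nn.ModuleList(
            [nn.Linear(hidden_size, action_dim) for _ in range(num_agents)])

    def forward(self, state, agent_id):
        """Bounded action for a single agent's head."""
        return torch.tanh(self.heads[agent_id](self.trunk(state)))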
def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, num_agents, init_w=True):

    self.algo_name = algo_name
    self.gamma = gamma
    self.tau = tau
    self.total_update = 0
    self.agent_id = id
    self.actualize = actualize
    self.use_gpu = use_gpu
    self.tracker = utils.Tracker(foldername, ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag, 'alz_policy'+savetag], '.csv', save_iteration=1000, conv_size=1000)

    # Initialize actors
    self.policy = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    if init_w:
        self.policy.apply(utils.init_weights)
    self.policy_target = MultiHeadActor(state_dim, action_dim, hidden_size, num_agents)
    utils.hard_update(self.policy_target, self.policy)
    self.policy_optim = Adam(self.policy.parameters(), actor_lr)

    # Single critic over each agent's local state-action, unlike the per-agent joint critics above
    self.critic = QNetwork(state_dim, action_dim, hidden_size)
    if init_w:
        self.critic.apply(utils.init_weights)
    self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
    utils.hard_update(self.critic_target, self.critic)
    self.critic_optim = Adam(self.critic.parameters(), critic_lr)

    # Optional actualization network, enabled by the actualize flag
    if actualize:
        self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
        if init_w:
            self.ANetwork.apply(utils.init_weights)
        self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
        self.actualize_lr = 0.2
        if use_gpu:
            self.ANetwork.cuda()

    self.loss = nn.MSELoss()

    if use_gpu:
        self.policy_target.cuda()
        self.critic_target.cuda()
        self.policy.cuda()
        self.critic.cuda()

    self.num_critic_updates = 0

    # Statistics Tracker
    # self.action_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.policy_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q_loss = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.q = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.alz_score = {'min': None, 'max': None, 'mean': None, 'std': None}
    self.alz_policy = {'min': None, 'max': None, 'mean': None, 'std': None}
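# Both initializers call policy.apply(utils.init_weights) when init_w is set;
# nn.Module.apply invokes the given function on every submodule recursively.
# The helper itself is not shown in this excerpt; a common form it takes (the
# choice of Kaiming-uniform init on Linear layers is an assumption):
import torch.nn as nn

def init_weights(m):
    """Reinitialize Linear layers; leave other module types untouched."""
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)
        m.bias.data.fill_(0.0)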
        champ_wwid = int(self.rollout_bucket[0].wwid.item())
        all_fitness = [fitness]
        max_fit = fitness
        all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid


if __name__ == "__main__":
    args = Parameters()  # Create the Parameters class
    SAVETAG = SAVETAG + '_p' + str(PORTFOLIO_ID)
    SAVETAG = SAVETAG + '_s' + str(SEED)
    if ISOLATE_PG:
        SAVETAG = SAVETAG + '_pg'

    frame_tracker = utils.Tracker(args.savefolder, ['score_' + ENV_NAME + SAVETAG], '.csv')  # Tracker class to log progress
    max_tracker = utils.Tracker(args.aux_folder, ['pop_max_score_' + ENV_NAME + SAVETAG], '.csv')  # Tracker class to log progress FOR MAX (NOT REPORTED)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    # INITIALIZE THE MAIN AGENT CLASS
    agent = CERL_Agent(args)  # Initialize the agent
    print('Running CERL for', ENV_NAME, 'State_dim:', args.state_dim, ' Action_dim:', args.action_dim)
        # Evolution Step
        for agent in self.agents:
            agent.evolve()

        # Save models periodically
        if gen % 20 == 0:
            for id, test_actor in enumerate(self.test_agent.rollout_actor):
                torch.save(test_actor.state_dict(), self.args.model_save + str(id) + '_' + self.args.actor_fname)
            print("Models Saved")

        return all_fits, pg_fits, test_fits


if __name__ == "__main__":
    args = Parameters()  # Create the Parameters class
    test_tracker = utils.Tracker(args.metric_save, [args.log_fname], '.csv')  # Initiate tracker
    torch.manual_seed(args.seed); np.random.seed(args.seed); random.seed(args.seed)  # Seeds
    if args.config.env_choice == 'hyper':
        from envs.hyper.PowerPlant_env import Fast_Simulator  # Main module needs access to this class

    # INITIALIZE THE MAIN AGENT CLASS
    ai = MERL(args)
    print('Running ', args.config.env_choice, 'with config ', args.config.config, ' State_dim:', args.state_dim, 'Action_dim', args.action_dim)
    time_start = time.time()

    ###### TRAINING LOOP ########
    for gen in range(1, 10000000000):  # RUN VIRTUALLY FOREVER
        # ONE EPOCH OF TRAINING
            # Synch RL Agent to NE
            if self.num_games % self.args.synch_period == 0:
                self.rl_to_evo(self.rl_agent.actor, self.pop[worst_index])
                self.evolver.rl_policy = worst_index
                print('Synch from RL --> Nevo')

        # print("ddpg time:", (time.time()-time_evolution)/3600)
        return best_train_fitness, test_score, elite_index


if __name__ == "__main__":
    num_processes = 4
    parameters = Parameters()  # Create the Parameters class
    tracker = utils.Tracker(parameters, ['erl'], '_score.csv')  # Initiate tracker
    frame_tracker = utils.Tracker(parameters, ['frame_erl'], '_score.csv')  # Initiate tracker
    time_tracker = utils.Tracker(parameters, ['time_erl'], '_score.csv')

    # Create Env
    env = utils.NormalizedActions(gym.make(env_tag))
    parameters.action_dim = env.action_space.shape[0]
    parameters.state_dim = env.observation_space.shape[0]

    # Seed
    env.seed(parameters.seed)
    torch.manual_seed(parameters.seed)
    np.random.seed(parameters.seed)
    random.seed(parameters.seed)
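# rl_to_evo, called above to overwrite the weakest evolutionary population
# member with the gradient-trained actor, is not shown in this excerpt. A
# minimal sketch of the weight transfer it typically performs in ERL-style
# codebases, assuming both actors share an architecture:
def rl_to_evo(rl_actor, evo_actor):
    """Copy the RL actor's parameters into the evolutionary actor in place."""
    for target_param, param in zip(evo_actor.parameters(), rl_actor.parameters()):
        target_param.data.copy_(param.data)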
        # Off-policy PG for subs
        if len(self.replay_buffer) > self.args.batch_size * 5:
            transitions = self.replay_buffer.sample(self.args.batch_size)
            batch = replay_memory.Transition(*zip(*transitions))
            self.agent.learn_sub(torch.cat(batch.state), torch.cat(batch.action), torch.cat(batch.next_state), self.args.sub_gamma, self.args.num_gradient_steps)

        return score


if __name__ == "__main__":
    parameters = Parameters()  # Create the Parameters class
    tracker = utils.Tracker(parameters, ['sokoban'], '_score.csv')  # Initiate tracker

    # Create Env
    env = Grid_Soccer.GridBallWorld()

    # Seed
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

    # Create Agent
    learner = Learner(parameters, env)

    for gen in range(1000000):
        score = learner.train()
        print('#Gen:', gen, ' Score:', '%.2f' % score, 'Buffer_Size',
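# The sampling code above uses the classic Transition-namedtuple idiom:
# zip(*transitions) transposes a list of Transitions into per-field tuples so
# each field can be torch.cat'ed into a batch. The replay_memory module is not
# shown here; a minimal sketch consistent with that call pattern (the exact
# field set is an assumption based on the learn_sub arguments):
import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, *args):
        """Append a transition, evicting the oldest once capacity is reached."""
        self.memory.append(Transition(*args))
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)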
            self.rl_agent.update_parameters(batch)

            # Synch RL Agent to NE
            if self.num_games % 10 == 0 and self.args.use_evo:
                self.rl_to_evo(self.rl_agent.actor, self.pop[worst_index])
                print('Synch from RL --> Nevo')
        else:
            rl_score = None

        return best_train_fitness, test_score, rl_score, elite_index


if __name__ == "__main__":
    parameters = Parameters()  # Create the Parameters class
    tracker = utils.Tracker(parameters, ['score', 'steps'], '_score.csv')  # Initiate tracker
    # frame_tracker = utils.Tracker(parameters, ['frame_evo', 'frame_rl'], '_score.csv')  # Initiate tracker
    # time_tracker = utils.Tracker(parameters, ['time_evo', 'time_rl'], '_score.csv')

    if False:  # DeepMind Control Suite (disabled toggle)
        env = suite.load(domain_name=env_name, task_name=task_name)
        parameters.action_dim = env.action_spec().shape[0]
        state = env.observation_spec()
        shape = 0
        for key, value in state.items():
            # Scalar observations have an empty shape; count them as one dim
            if len(value.shape) != 0:
                shape += value.shape[0]
            else:
                shape += 1
        parameters.state_dim = shape
    else:  # OpenAI Gym
        env = gym.make(env_tag)
parameters = Parameters()  # Create the Parameters class

#################### PROCESS FILENAMES TO SAVE PROGRESS ################################
parameters.critic_fname = shape_filename(parameters.critic_fname, parameters) + SAVE_TAG
parameters.actor_fname = shape_filename(parameters.actor_fname, parameters) + SAVE_TAG
parameters.log_fname = shape_filename(parameters.log_fname, parameters) + SAVE_TAG
parameters.best_fname = shape_filename(parameters.best_fname, parameters) + SAVE_TAG
####################################################

# frame_tracker = utils.Tracker(parameters.metric_save, [parameters.log_fname + '_1', parameters.log_fname + '_2'], '.csv')  # Initiate tracker
ml_tracker = utils.Tracker(parameters.aux_save, [parameters.log_fname + 'critic_loss', parameters.log_fname + 'policy_loss'], '.csv')  # Initiate tracker

# Seeds
torch.manual_seed(parameters.seed)
np.random.seed(parameters.seed)
random.seed(parameters.seed)

# INITIALIZE THE MAIN AGENT CLASS
agent = PG_ALGO(parameters)
print('Running', parameters.algo, ' State_dim:', parameters.state_dim, ' Action_dim:', parameters.action_dim, 'for', 'Round 1' if DIFFICULTY == 0 else 'Round 2')
time_start = time.time()
parser.add_argument('-save_periodic', help='Save actor, critic and memory periodically', action='store_true')
parser.add_argument('-next_save', help='Generation save frequency for save_periodic', type=int, default=200)
parser.add_argument('-test_operators', help='Runs the operator runner to test the operators', action='store_true')

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

if __name__ == "__main__":
    parameters = Parameters(parser)  # Inject the command-line arguments into the Parameters object
    tracker = utils.Tracker(parameters, ['erl'], '_score.csv')  # Initiate tracker
    frame_tracker = utils.Tracker(parameters, ['frame_erl'], '_score.csv')  # Initiate tracker
    time_tracker = utils.Tracker(parameters, ['time_erl'], '_score.csv')
    ddpg_tracker = utils.Tracker(parameters, ['ddpg'], '_score.csv')
    selection_tracker = utils.Tracker(parameters, ['elite', 'selected', 'discarded'], '_selection.csv')

    # Create Env
    env = utils.NormalizedActions(gym.make(parameters.env_name))
    parameters.action_dim = env.action_space.shape[0]
    parameters.state_dim = env.observation_space.shape[0]

    # Write the parameters to the info file and print them
    parameters.write_params(stdout=True)
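# The flags above use argparse's single-dash long-option form; after parsing
# they surface as attributes on the returned namespace. A self-contained
# demonstration of the same arguments, independent of this repo's Parameters
# class (demo_parser and the sample argv are purely illustrative):
import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('-save_periodic', help='Save actor, critic and memory periodically', action='store_true')
demo_parser.add_argument('-next_save', help='Generation save frequency for save_periodic', type=int, default=200)
demo_args = demo_parser.parse_args(['-save_periodic', '-next_save', '100'])
assert demo_args.save_periodic is True and demo_args.next_save == 100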
        # Evolution Step
        self.agents.evolve()

        # Save models periodically
        if gen % 20 == 0:
            torch.save(self.test_agent.predator[0].state_dict(), self.args.model_save + 'predator_' + self.args.savetag)
            torch.save(self.test_agent.prey[0].state_dict(), self.args.model_save + 'prey_' + self.args.savetag)
            print("Models Saved")

        return all_fits, pg_fits, test_fits, prey_score


if __name__ == "__main__":
    args = Parameters()  # Create the Parameters class
    test_tracker = utils.Tracker(args.metric_save, [args.log_fname], '.csv')  # Initiate tracker
    prey_tracker = utils.Tracker(args.metric_save, ['prey_' + args.log_fname], '.csv')  # Initiate tracker
    selects_tracker = utils.Tracker(args.metric_save, ['selects_' + args.log_fname], '.csv')
    torch.manual_seed(args.seed); np.random.seed(args.seed); random.seed(args.seed)  # Seeds
    if args.config.env_choice == 'hyper':
        from envs.hyper.PowerPlant_env import Fast_Simulator  # Main module needs access to this class

    # INITIALIZE THE MAIN AGENT CLASS
    ai = MERL(args)
    print('Running ', args.config.env_choice, 'with config ', args.config.config, ' Predator State_dim:', args.pred_state_dim, 'Prey_state_dim', args.prey_state_dim, 'Action_dim', args.action_dim)
    time_start = time.time()

    ###### TRAINING LOOP ########
    for gen in range(1, 10000000000):  # RUN VIRTUALLY FOREVER
        # NeuroEvolution's probabilistic selection and recombination step
        self.evolver.epoch(self.pop, all_net_ids, all_fitness, all_shaped_fitness)

        # Synch RL Agent to NE periodically
        if gen % 5 == 0:
            self.evolver.sync_rl(self.args.rl_models, self.pop)

        return max(all_fitness), all_eplens[all_fitness.index(max(all_fitness))], all_fitness, all_eplens, all_shaped_fitness


if __name__ == "__main__":
    parameters = Parameters()  # Create the Parameters class
    frame_tracker = utils.Tracker(parameters.metric_save, ['erl', 'eugenics'], '.csv')  # Tracker class to log progress

    # Set seeds
    torch.manual_seed(parameters.seed)
    np.random.seed(parameters.seed)
    random.seed(parameters.seed)

    # INITIALIZE THE MAIN AGENT CLASS
    agent = ERL_Agent(parameters)  # Initialize the agent
    print('Running osim-rl', ' State_dim:', parameters.state_dim, ' Action_dim:', parameters.action_dim, 'using ERL for', 'Round 1' if DIFFICULTY == 0 else 'Round 2')
    time_start = time.time()

    for gen in range(1, 1000000000):  # Infinite generations
        gen_time = time.time()