def __init__(self, env_name, params):
    self.env = envs.make(env_name)
    self.params = params
    self.action_bound = self.env.action_bound[1]
    self.iterations = params["iterations"]
    self.mem_len = params["mem_len"]
    self.seed = params["seed"]
    self.render = params["render"]
    self.log_interval = params["log_interval"]
    self.warmup = params["warmup"]
    self.batch_size = params["batch_size"]
    self.save = params["save"]

    hidden_dim = params["hidden_dim"]
    state_dim = self.env.observation_space
    action_dim = self.env.action_space
    cuda = params["cuda"]
    network_settings = params["network_settings"]

    # actor-critic networks and their target copies
    actor = utils.Actor(state_dim, hidden_dim, action_dim)
    target_actor = utils.Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    self.agent = sw.Sleepwalk(actor, critic, target_actor, target_critic, network_settings, GPU=cuda)

    # Ornstein-Uhlenbeck noise for exploration
    self.noise = utils.OUNoise(action_dim)
    self.noise.set_seed(self.seed)

    # replay memory sized from the config (was assigned twice in the original; keep the configured size)
    self.memory = utils.ReplayMemory(self.mem_len)

    self.pol_opt = torch.optim.Adam(actor.parameters())
    self.crit_opt = torch.optim.Adam(critic.parameters())

    if cuda:
        self.Tensor = torch.cuda.FloatTensor
    else:
        self.Tensor = torch.Tensor

    if self.render:
        self.env.init_rendering()

    self.best = None

    # initialize experiment logging
    self.logging = params["logging"]
    if self.logging:
        self.directory = os.getcwd()
        filename = self.directory + "/data/qprop.csv"
        with open(filename, "w") as csvfile:
            self.writer = csv.writer(csvfile)
            self.writer.writerow(["episode", "reward"])
            self.train()
    else:
        self.train()
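The snippets in this section all construct `utils.OUNoise`, but the class itself is never shown. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process exposing the `noise()` and `set_seed()` methods used here, assuming the common theta/sigma parameterization; the parameter names and defaults are illustrative and not necessarily those of the actual `utils` module.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise (sketch)."""

    def __init__(self, action_dim, scale=1.0, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def set_seed(self, seed):
        # seeds the global NumPy RNG used by noise()
        np.random.seed(seed)

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); drift the internal state by dx
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale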
def __init__(self, agent_dict={}, actor_dict={}, critic_dict={}):
    """Initialize Agent object.

    Params
    ======
        agent_dict (dict): dictionary containing parameters for the agent
        actor_dict (dict): dictionary containing parameters for the agent's actor model
        critic_dict (dict): dictionary containing parameters for the agent's critic model
    """
    enable_cuda = agent_dict.get("enable_cuda", False)
    if enable_cuda:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        self.device = torch.device("cpu")

    self.num_agents = agent_dict.get("num_agents", 20)
    self.num_episodes = agent_dict.get("num_episodes", 10000)
    self.save_after = agent_dict.get("save_after", -1)
    self.name = agent_dict.get("name", "reacher")
    self.gamma = agent_dict.get("gamma", 0.9)
    self.tau = agent_dict.get("tau", 0.001)
    self.noise = utils.OUNoise((self.num_agents, 4), 0)
    self.num_replays = agent_dict.get("num_replays", 1)
    self.learning_rate_actor = agent_dict.get("learning_rate_actor", 1E-3)
    self.learning_rate_critic = agent_dict.get("learning_rate_critic", 1E-3)
    self.criterion = nn.MSELoss()

    memory_size = agent_dict.get("memory_size", 2**14)
    batchsize = agent_dict.get("batchsize", 2**10)
    replay_reg = agent_dict.get("replay_reg", 0.0)
    self.replay_buffer = utils.ReplayBuffer(memory_size, batchsize)

    # actor-critic networks and their target copies
    self.actor = model.ActorModel(actor_dict).to(self.device)
    self.actor_target = model.ActorModel(actor_dict).to(self.device)
    self.critic = model.CriticModel(critic_dict).to(self.device)
    self.critic_target = model.CriticModel(critic_dict).to(self.device)

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.learning_rate_actor)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.learning_rate_critic)

    # hard-copy weights into the target networks (tau=1.0)
    utils.copy_model(self.actor, self.actor_target, tau=1.0)
    utils.copy_model(self.critic, self.critic_target, tau=1.0)

    seed = agent_dict.get("seed", 0)
    torch.manual_seed(seed)
    np.random.seed(seed)
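`utils.copy_model` is called above with `tau=1.0` to hard-copy weights into the target networks, but its implementation is not shown. A plausible sketch under the usual Polyak-averaging convention (this is an assumption about `utils`, not its confirmed code):

def copy_model(source, target, tau=1.0):
    """Blend source weights into target: tau=1.0 is a hard copy, small tau is a soft update."""
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)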
def __init__(self, obs_size, act_size, seed=0, params=None, logger=None):
    """
    Initialize a Deep Deterministic Policy Gradient (DDPG) agent.

    Parameters
    ----------
    obs_size : int
        Number of observation elements.
    act_size : int
        Number of action elements.
    seed : int, optional
        Random seed. The default is 0.
    params : dict, optional
        Hyperparameter data structure. Defaults to ddpg_params().
    logger : optional
        Logger for storing training data.
    """
    if params is None:
        params = ddpg_params()

    # logger for storing training data
    self.logger = logger

    # parameters
    self.params = params
    self.step_count = 0
    if not torch.cuda.is_available() and self.params['device'] != 'cpu':
        print("GPU is not available. Selecting CPU...")
        self.params['device'] = 'cpu'

    # initialize actor and its target network (targets also moved to the selected device)
    self.actor = models.DeterministicActor(obs_size, act_size, seed).to(self.params['device'])
    self.target_actor = models.DeterministicActor(obs_size, act_size, seed).to(self.params['device'])
    self.target_actor.load_state_dict(self.actor.state_dict())

    # initialize critic and its target network
    self.critic = models.QCritic(obs_size, act_size, seed).to(self.params['device'])
    self.target_critic = models.QCritic(obs_size, act_size, seed).to(self.params['device'])
    self.target_critic.load_state_dict(self.critic.state_dict())

    # create optimizers
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.params['actor_lr'])
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.params['critic_lr'])

    # experience replay
    self.buffer = utils.ExperienceBuffer(obs_size, act_size, params['buffer_length'])

    # Ornstein-Uhlenbeck noise model for exploration
    self.noise_model = utils.OUNoise(size=act_size,
                                     mean=self.params['noise_mean'],
                                     mac=self.params['noise_mac'],
                                     var=self.params['noise_var'],
                                     varmin=self.params['noise_var_min'],
                                     decay=self.params['noise_decay'])
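The constructor falls back to `ddpg_params()` when no hyperparameters are passed. A sketch of what such a defaults function might return, covering only the keys this constructor actually reads; the values are illustrative placeholders, not the project's tuned settings, and the reading of 'noise_mac' as a mean-attraction constant is an assumption.

def ddpg_params():
    # illustrative defaults; only the keys referenced by the constructor above
    return {
        'device': 'cpu',
        'actor_lr': 1e-4,
        'critic_lr': 1e-3,
        'buffer_length': int(1e6),
        'noise_mean': 0.0,
        'noise_mac': 0.15,       # presumably the mean-attraction constant (theta in OU terms)
        'noise_var': 0.2,
        'noise_var_min': 0.01,
        'noise_decay': 0.995,
    }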
def main():
    env = envs.make("model_training")
    max_rpm = env.action_bound[1]
    action_dim = env.action_space
    state_dim = env.observation_space

    epochs = 250000
    hidden_dim = 64

    dyn = model.Transition(state_dim, action_dim, hidden_dim, GPU)
    if GPU:
        dyn = dyn.cuda()
        Tensor = torch.cuda.FloatTensor
    else:
        Tensor = torch.Tensor

    counter = 0
    running = True
    H = 100

    # Ornstein-Uhlenbeck noise used to generate exploratory rotor commands
    noise = utils.OUNoise(action_dim, mu=10)

    env.init_rendering()
    while running:
        # set random state
        state = Tensor(env.reset())
        state_actions = []
        next_states = []

        # run a trajectory of length H under the noise process
        for i in range(1, H + 1):
            action = np.array([noise.noise()], dtype="float32") * env.action_bound[1]
            action_tensor = torch.from_numpy(action)
            if GPU:
                action_tensor = action_tensor.cuda()
            state_action = torch.cat([state, action_tensor], dim=1)
            next_state, _, _, _ = env.step(action.reshape(action_dim,))
            # env.render()
            next_state = Tensor(next_state)
            state_actions.append(state_action)
            next_states.append(next_state)
            state = next_state

        state_actions = torch.stack(state_actions).squeeze(1)
        next_states = torch.stack(next_states).squeeze(1)

        # one-step transition model update on the collected trajectory
        traj = {"state_actions": state_actions, "next_states": next_states}
        loss = dyn.batch_update(traj)
        print("---Model loss: {:.8f}---".format(loss))

        counter += 1
        if counter > epochs:
            running = False
            directory = os.getcwd()
            # print("Saving figures")
            # fig1.savefig(directory + "/figures/one_step_loss.pdf", bbox_inches="tight")
            print("Saving model")
            torch.save(dyn, directory + "/saved_models/one_step.pth.tar")
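`model.Transition` and its `batch_update` method are not shown. Below is a minimal sketch of a one-step dynamics model of the kind the loop above trains: an MLP that regresses next states from concatenated state-action inputs. The architecture, optimizer, and loss are illustrative assumptions, not the repo's actual implementation.

import torch
import torch.nn as nn

class OneStepTransition(nn.Module):
    """One-step dynamics model: predicts s_{t+1} from (s_t, a_t) (sketch)."""

    def __init__(self, state_dim, action_dim, hidden_dim, gpu=False):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, state_dim),
        )
        self.opt = torch.optim.Adam(self.parameters())
        self.loss_fn = nn.MSELoss()

    def forward(self, state_action):
        return self.net(state_action)

    def batch_update(self, traj):
        # regress predicted next states onto the observed ones for the whole trajectory
        pred = self.forward(traj["state_actions"])
        loss = self.loss_fn(pred, traj["next_states"])
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()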
def __init__(self, env_name, params):
    self.env = envs.make(env_name)
    self.params = params
    self.action_bound = self.env.action_bound[1]
    self.iterations = params["iterations"]
    self.seed = params["seed"]
    self.render = params["render"]
    self.log_interval = params["log_interval"]
    self.save = params["save"]
    self.cuda = params["cuda"]

    state_dim = self.env.observation_space
    action_dim = self.env.action_space
    hidden_dim = params["hidden_dim"]
    network_settings = params["network_settings"]

    # policy, behaviour policy, and critic networks
    pi = utils.Actor(state_dim, hidden_dim, action_dim)
    beta = utils.Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim, hidden_dim, 1)

    self.agent = fmis.FMIS(pi, beta, critic, self.env, network_settings, GPU=self.cuda)
    self.pi_optim = torch.optim.Adam(self.agent.parameters())
    self.memory = fmis.ReplayMemory(1000000)

    if self.cuda:
        self.Tensor = torch.cuda.FloatTensor
    else:
        self.Tensor = torch.Tensor

    if self.render:
        self.env.init_rendering()

    self.best = None

    # use OU noise to explore and learn the model for n warmup episodes
    self.noise = utils.OUNoise(action_dim, mu=10)
    self.warmup = 5

    # initialize experiment logging
    self.logging = params["logging"]
    if self.logging:
        self.directory = os.getcwd()
        filename = self.directory + "/data/fmis.csv"
        with open(filename, "w") as csvfile:
            self.writer = csv.writer(csvfile)
            self.writer.writerow(["episode", "reward"])
            self.train()
    else:
        self.train()
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        os.makedirs(os.path.join(args.log_dir, args.log_file))

    env = gym.make(args.env_name)
    obs = env.reset()

    # obs = env.reset() => {"observation": (10,), "achieved_goal": (3,), "desired_goal": (3,)}
    goal = obs['desired_goal']
    state = obs['observation']

    # # Write hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # Initialize policies and replay buffers
    controller_policy = hiro.Controller(state_dim=state_dim,
                                        goal_dim=state_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=args.ctrl_act_lr,
                                        critic_lr=args.ctrl_crit_lr,
                                        ctrl_rew_type=args.ctrl_rew_type)

    manager_policy = hiro.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=state_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals)

    calculate_controller_reward = hiro_controller_reward

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)
    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Logging parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps,
                    args.ctrl_batch_size, args.discount, args.ctrl_tau)

                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss, total_timesteps)
                writer.add_scalar('data/controller_critic_loss', ctrl_crit_loss, total_timesteps)
                writer.add_scalar('data/controller_ep_rew', episode_reward, total_timesteps)
                writer.add_scalar('data/manager_ep_rew', manager_transition[4], total_timesteps)

                # Train manager
                if timesteps_since_manager >= args.train_manager_freq:
                    print('Training Manager...')
                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)

                    writer.add_scalar('data/manager_actor_loss', man_act_loss, total_timesteps)
                    writer.add_scalar('data/manager_critic_loss', man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew, total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew', avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps, total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved', avg_env_finish, total_timesteps)

                    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

                    np.save("./results/%s" % file_name, evaluations)

                # Process final state/obs and store the manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have the same sequence length
                    while len(manager_transition[-2]) <= args.manager_propose_freq:
                        manager_transition[-1].append(np.inf)
                        manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']      # shape (3,)
            state = obs['observation']      # shape (10,)

            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            # Create a new high level (manager) transition
            subgoal = manager_policy.sample_goal(state, goal)
            timesteps_since_subgoal = 0
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (next_obs, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process the next observation
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off-policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate intrinsic reward and transition the subgoal
        controller_reward = calculate_controller_reward(state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        controller_buffer.add(
            (state, next_state, subgoal, action, controller_reward, float(done), [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish and store the current manager transition
            manager_transition[1] = state
            manager_transition[5] = float(True)
            manager_buffer.add(manager_transition)

            # Sample a new subgoal and perturb it
            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=np.inf)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a new high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

    # Final evaluation
    evaluations.append([evaluate_policy(env, writer, manager_policy, controller_policy,
                                        calculate_controller_reward, args.ctrl_rew_scale,
                                        args.manager_propose_freq, len(evaluations))])

    if args.save_models:
        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

    np.save("./results/%s" % file_name, evaluations)
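`calculate_controller_reward` is bound to `hiro_controller_reward`, which is not shown above. In the HIRO formulation the low-level intrinsic reward is the negative distance between the proposed subgoal and the achieved change in state. A sketch under that assumption, matching the call signature used in the loop (this is the standard formula, not necessarily this repo's exact implementation):

import numpy as np

def hiro_controller_reward(state, subgoal, next_state, scale):
    # HIRO-style intrinsic reward: -||s_t + g_t - s_{t+1}||_2, scaled
    return -np.linalg.norm(state + subgoal - next_state, axis=-1) * scale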
batch_size = 64
DDPG = ut.DDPG(dim_state, device)
print("NN initialised")

# noise is (re)initialised at the start of each episode below
# Training process
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)
best_loss = 0
q = 0

for i in EPISODES:
    # initialise the Ornstein-Uhlenbeck noise sampler for this episode
    ou_noise = ut.OUNoise()

    # Reset environment data
    done = False
    state = env.reset()

    # initialise per-episode variables
    total_episode_reward = 0.
    t = 0
    temp_loss = []

    while not done:
        # Create state tensor, remember to use single precision (torch.float32)
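The loop above is cut off right after the comment about building the state tensor. For reference, here is a self-contained sketch of the core DDPG update that such a loop eventually performs on a sampled batch. The networks, optimizers, and the critic's (state, action) call signature are generic assumptions; this is the textbook update, not the `ut.DDPG` implementation.

import torch
import torch.nn.functional as F

def ddpg_update(batch, actor, critic, target_actor, target_critic,
                actor_opt, critic_opt, gamma=0.99, tau=0.005):
    """One DDPG update on a batch of (states, actions, rewards, next_states, dones) tensors."""
    states, actions, rewards, next_states, dones = batch

    # critic target: r + gamma * Q'(s', mu'(s')) for non-terminal transitions
    with torch.no_grad():
        target_q = rewards + gamma * (1.0 - dones) * target_critic(next_states, target_actor(next_states))
    critic_loss = F.mse_loss(critic(states, actions), target_q)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # actor loss: maximize Q(s, mu(s)), i.e. minimize its negative
    actor_loss = -critic(states, actor(states)).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Polyak-average the target networks
    for net, target in ((actor, target_actor), (critic, target_critic)):
        for p, tp in zip(net.parameters(), target.parameters()):
            tp.data.mul_(1.0 - tau).add_(tau * p.data)

    return actor_loss.item(), critic_loss.item()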
def __init__(self, env_name, params):
    # initialize environment
    self.env = envs.make(env_name)
    self.env_name = env_name

    # save important experiment parameters for the training loop
    self.iterations = params["iterations"]
    self.mem_len = params["mem_len"]
    self.seed = params["seed"]
    self.render = params["render"]
    self.log_interval = params["log_interval"]
    self.warmup = params["warmup"]
    self.batch_size = params["batch_size"]
    self.save = params["save"]

    # initialize DDPG agent using experiment parameters from config file
    self.action_bound = self.env.action_bound[1]
    state_dim = self.env.observation_space
    action_dim = self.env.action_space
    hidden_dim = params["hidden_dim"]
    cuda = params["cuda"]
    network_settings = params["network_settings"]
    actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
    target_actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    self.agent = ddpg.DDPG(actor, target_actor, critic, target_critic, network_settings, GPU=cuda)

    # initialize Ornstein-Uhlenbeck noise for random action exploration
    ou_scale = params["ou_scale"]
    ou_mu = params["ou_mu"]
    ou_sigma = params["ou_sigma"]
    self.noise = utils.OUNoise(action_dim, scale=ou_scale, mu=ou_mu, sigma=ou_sigma)
    self.noise.set_seed(self.seed)
    self.memory = utils.ReplayMemory(self.mem_len)
    self.pol_opt = torch.optim.Adam(actor.parameters())
    self.crit_opt = torch.optim.Adam(critic.parameters())

    # want to save the best policy
    self.best = None

    # send to GPU if flagged in experiment config file
    if cuda:
        self.Tensor = torch.cuda.FloatTensor
        self.agent = self.agent.cuda()
    else:
        self.Tensor = torch.Tensor

    if self.render:
        self.env.init_rendering()

    # initialize experiment logging. This wipes any previous file with the same name
    self.logging = params["logging"]
    if self.logging:
        self.directory = os.getcwd()
        filename = self.directory + "/data/ddpg.csv"
        with open(filename, "w") as csvfile:
            self.writer = csv.writer(csvfile)
            self.writer.writerow(["episode", "reward"])
            self.train()
    else:
        self.train()
def __init__(self, env_name, params):
    # initialize environment
    self.__env = gym.make(env_name)
    self.__env_name = env_name

    # save important experiment parameters for the training loop
    self.__iterations = params["iterations"]
    self.__mem_len = params["mem_len"]
    self.__seed = params["seed"]
    self.__render = params["render"]
    self.__log_interval = params["log_interval"]
    self.__warmup = params["warmup"]
    self.__batch_size = params["batch_size"]
    self.__learning_updates = params["learning_updates"]
    self.__save = params["save"]

    # initialize DDPG agent using experiment parameters from config file
    state_dim = self.__env.observation_space.shape[0]
    action_dim = self.__env.action_space.shape[0]
    hidden_dim = params["hidden_dim"]
    cuda = params["cuda"]
    network_settings = params["network_settings"]
    actor = Actor(state_dim, hidden_dim, action_dim)
    target_actor = Actor(state_dim, hidden_dim, action_dim)
    critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
    self.__agent = DDPG(actor, target_actor, critic, target_critic, network_settings, GPU=cuda)

    # initialize Ornstein-Uhlenbeck noise for random action exploration
    ou_scale = params["ou_scale"]
    ou_mu = params["ou_mu"]
    ou_sigma = params["ou_sigma"]
    self.__noise = utils.OUNoise(action_dim, scale=ou_scale, mu=ou_mu, sigma=ou_sigma)
    self.__noise.set_seed(self.__seed)
    self.__memory = ReplayMemory(self.__mem_len)
    self.__pol_opt = torch.optim.Adam(actor.parameters(), params["actor_lr"])
    self.__crit_opt = torch.optim.Adam(critic.parameters(), params["critic_lr"])

    # want to save the best policy
    self.__best = None

    # send to GPU if flagged in experiment config file
    if cuda:
        self.__Tensor = torch.cuda.FloatTensor
        self.__agent = self.__agent.cuda()
    else:
        self.__Tensor = torch.Tensor

    # initialize experiment logging. This wipes any previous file with the same name
    self.__logging = params["logging"]
    self.__directory = os.getcwd()
    if self.__logging:
        filename = self.__directory + "/data/ddpg-" + self.__env_name + ".csv"
        with open(filename, "w") as csvfile:
            self.__writer = csv.writer(csvfile)
            self.__writer.writerow(["episode", "reward"])
            self._run_algo()
    else:
        self._run_algo()
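Several of the constructors above build a `ReplayMemory` (or `utils.ReplayMemory`) whose implementation is not shown. A minimal sketch of such a buffer with uniform random sampling; the `push`/`sample` method names and the transition fields are assumptions about how the training loops consume it, not the project's confirmed API.

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience", ("state", "action", "next_state", "reward"))

class ReplayMemory:
    """Fixed-size ring buffer of transitions with uniform random sampling (sketch)."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # store one transition, evicting the oldest when the buffer is full
        self.memory.append(Experience(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)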
SIGMA_INIT = float(conf.get('actor', 'sigma_init'))
ADJUST_STEP = int(conf.get('actor', 'adjust_step'))
P_LEARNING_RATE = float(conf.get('actor', 'learning_rate'))

Q_HIDDEN_SIZE = int(conf.get('critic', 'hidden_size'))
Q_LEARNING_RATE = float(conf.get('critic', 'learning_rate'))

BATCH_SIZE = int(conf.get('main', 'batch_size'))
NUM_PARAL = int(conf.get('main', 'num_paral'))
AUDIO_SEGMENT = int(conf.get('main', 'audio_segment'))
frameRate_Hz = int(conf.get('main', 'frameRate_Hz'))

### Condition Setting
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# actor (policy) and critic (Q-function) networks
policy = models.stacked_BLSTM(IN_SIZE, OUT_SIZE, P_HIDDEN_SIZE, P_NUM_LAYERS, SIGMA_INIT).to(device)
q_func = models.Qfunction(IN_SIZE, OUT_SIZE, Q_HIDDEN_SIZE).to(device)

# Ornstein-Uhlenbeck exploration noise (batch size x output size)
ou_noise = utils.OUNoise(BATCH_SIZE, OUT_SIZE)

loss_fun = nn.MSELoss(reduction='none')
p_optim = torch.optim.SGD(policy.parameters(), lr=P_LEARNING_RATE)
q_optim = torch.optim.Adam(q_func.parameters(), lr=Q_LEARNING_RATE)

train_loader = utils.Batch_generator('training', BATCH_SIZE)

# warm-start the policy from a pretrained model
policy.load_state_dict(torch.load('exp/pretrain.model'))
# p_optim.load_state_dict(torch.load('exp/p_optim.state'))
# q_func.load_state_dict(torch.load('exp/q1000.model'))
# q_optim.load_state_dict(torch.load('exp/q_optim.state'))

for iteration in range(10000000):
    policy.train()
    q_func.train()
    start = time.time()