def __init__(self, args, env, env_params, test_env):
    self.args = args
    self.env = env
    self.test_env = test_env
    self.env_params = env_params
    self.device = args.device
    self.resume = args.resume
    self.resume_epoch = args.resume_epoch

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    self.writer = SummaryWriter(log_dir='runs/ddpg' + current_time + '_' + str(args.env_name) +
                                str(args.lr_critic) + '_' + str(args.gamma) + '_' +
                                str(args.fps))
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)

    self.actor_network = actor(env_params)
    self.actor_target_network = actor(env_params)
    self.critic_network = criticWrapper(self.env_params, self.args)
    self.critic_target_network = criticWrapper(self.env_params, self.args)

    self.start_epoch = 0
    if self.resume:
        self.start_epoch = self.resume_epoch
        self.actor_network.load_state_dict(
            torch.load(self.args.resume_path + '/actor_model_' + str(self.resume_epoch) + '.pt')[0])
        self.critic_network.load_state_dict(
            torch.load(self.args.resume_path + '/critic_model_' + str(self.resume_epoch) + '.pt')[0])

    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())

    # if use gpu
    self.actor_network.to(self.device)
    self.critic_network.to(self.device)
    self.actor_target_network.to(self.device)
    self.critic_target_network.to(self.device)

    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)

    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k,
                                  self.args.distance, self.args.future_step)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions)
    self.planner_policy = Planner(agent=self, replay_buffer=self.buffer, fps=args.fps,
                                  clip_v=args.clip_v, n_landmark=args.landmark,
                                  initial_sample=args.initial_sample)
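# NOTE: illustrative sketch, not part of the original file. A DDPG-style agent with the
# target copies created above usually refreshes them with a Polyak (soft) update after each
# optimization step. The method name and the use of `self.args.polyak` as the averaging
# coefficient are assumptions about the rest of this class, not verified against the repo.
def _soft_update_target_network(self, target, source):
    # target <- polyak * target + (1 - polyak) * source, applied parameter-wise
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            self.args.polyak * target_param.data + (1 - self.args.polyak) * param.data)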
def __init__(self, args, env, env_params, test_env):
    self.args = args
    self.device = args.device
    self.env = env
    self.test_env = test_env
    self.env_params = env_params
    self.action_n = env.action_space.n
    self.resume = args.resume
    self.resume_epoch = args.resume_epoch
    self.init_qnets()

    self.start_epoch = 0
    if self.resume:
        self.start_epoch = self.resume_epoch
        print('resume from stored models ...')
        self.Q_network.load_state_dict(
            torch.load(self.args.path + '/q_model_' + str(self.resume_epoch) + '.pt')[0])
        self.targetQ_network.load_state_dict(
            torch.load(self.args.path + '/q_model_' + str(self.resume_epoch) + '.pt')[0])

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    self.writer = SummaryWriter(log_dir='runs/dqn' + current_time + '_mc' + str(args.gamma) +
                                '_' + str(args.fps))
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)

    self.eps = args.eps
    # load the weights into the target network
    self.targetQ_network.load_state_dict(self.Q_network.state_dict())
    # create the optimizer
    self.q_optim = torch.optim.Adam(self.Q_network.parameters(), lr=self.args.lr)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.args.distance)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions)
    self.planner_policy = Planner(agent=self, replay_buffer=self.buffer, fps=args.fps,
                                  clip_v=args.clip_v, n_landmark=args.landmark,
                                  initial_sample=args.initial_sample)
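# NOTE: illustrative sketch, not part of the original file. A DQN agent with `self.eps`,
# `self.action_n`, and `self.Q_network` as initialized above typically selects actions
# epsilon-greedily. The method name `select_action` and the exact tensor handling (in
# particular, whether the Q-network also takes a goal input) are assumptions.
def select_action(self, obs):
    if np.random.rand() < self.eps:
        # explore: uniform random action over the discrete action space
        return np.random.randint(self.action_n)
    with torch.no_grad():
        obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
        q_values = self.Q_network(obs_tensor)
    # exploit: pick the action with the highest predicted Q-value
    return int(q_values.argmax(dim=1).item())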
def __init__(self, args, env, env_params, test_env, test_env1=None, test_env2=None):
    self.args = args
    self.env = env
    self.test_env = test_env
    self.env_params = env_params
    self.device = args.device
    self.resume = args.resume
    self.resume_epoch = args.resume_epoch
    self.not_train_low = False
    self.test_env1 = test_env1
    self.test_env2 = test_env2
    self.old_sample = args.old_sample

    self.low_dim = env_params['obs']
    self.env_params['low_dim'] = self.low_dim
    self.hi_dim = env_params['obs']
    print("hi_dim", self.hi_dim)

    self.learn_goal_space = True
    self.whole_obs = False  # use the whole observation space as the subgoal space
    self.abs_range = abs_range = args.abs_range  # absolute goal range
    self.feature_reg = 0.0  # feature l2 regularization
    print("abs_range", abs_range)

    if args.env_name[:5] == "Fetch":
        maze_low = self.env.env.initial_gripper_xpos[:2] - self.env.env.target_range
        maze_high = self.env.env.initial_gripper_xpos[:2] + self.env.env.target_range
        self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
    else:
        if args.env_name != "NChain-v1":
            self.hi_act_space = self.env.env.maze_space
        else:
            self.hi_act_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]))

    if self.learn_goal_space:
        if args.env_name == "NChain-v1":
            self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range]), high=np.array([abs_range]))
        else:
            self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range, -abs_range]),
                                               high=np.array([abs_range, abs_range]))
    if self.whole_obs:
        vel_low = [-10.] * 4
        vel_high = [10.] * 4
        maze_low = np.concatenate((self.env.env.maze_low, np.array(vel_low)))
        maze_high = np.concatenate((self.env.env.maze_high, np.array(vel_high)))
        self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)

    dense_low = True
    self.low_use_clip = not dense_low  # only the sparse reward uses clipping
    if args.replay_strategy == "future":
        self.low_forward = True
        assert self.low_use_clip is True
    else:
        self.low_forward = False
        assert self.low_use_clip is False
    self.hi_sparse = (self.env.env.reward_type == "sparse")

    # params of learning phi
    resume_phi = args.resume
    self.not_update_phi = False
    phi_path = args.resume_path
    # resume_phi = True
    # phi_path = 'saved_models/AntMaze1-v1_Jun01_19-26-19'
    # self.not_update_phi = True

    self.save_fig = False
    self.save_model = False
    self.start_update_phi = args.start_update_phi
    # after the success rate converges, stop updating the low-level policy and the feature
    self.early_stop = args.early_stop
    if args.env_name in ['AntPush-v1', 'AntFall-v1']:
        if self.not_update_phi:
            self.early_stop_thres = 900
        else:
            self.early_stop_thres = 3500
    elif args.env_name in ["PointMaze1-v1"]:
        self.early_stop_thres = 2000
    elif args.env_name == "AntMaze1-v1":
        self.early_stop_thres = 3000
    else:
        self.early_stop_thres = args.n_epochs
    print("early_stop_threshold", self.early_stop_thres)
    self.success_log = []
    # scaling = self.env.env.env.MAZE_SIZE_SCALING
    # print("scaling", scaling)

    self.count_latent = False
    if self.count_latent:
        self.hash = HashingBonusEvaluator(512, 2)
    self.count_obs = False
    if self.count_obs:
        self.hash = HashingBonusEvaluator(512, env_params['obs'])
    self.high_correct = False
    self.k = args.c
    self.delta_k = 0
    self.prediction_coeff = 0.0
    tanh_output = False
    self.use_prob = False
    print("prediction_coeff", self.prediction_coeff)

    if args.save:
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        self.log_dir = 'runs/hier/' + str(args.env_name) + '/RB_Decay_' + current_time + \
                       "_C_" + str(args.c) + "_Image_" + str(args.image) + \
                       "_Seed_" + str(args.seed) + "_Reward_" + str(args.low_reward_coeff) + \
                       "_NoPhi_" + str(self.not_update_phi) + "_LearnG_" + str(self.learn_goal_space) + \
                       "_Early_" + str(self.early_stop_thres) + str(args.early_stop)
        self.writer = SummaryWriter(log_dir=self.log_dir)
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # init low-level network
    self.real_goal_dim = self.hi_act_space.shape[0]  # low-level goal space and high-level action space
    self.init_network()
    # init high-level agent
    self.hi_agent = SAC(self.hi_dim + env_params['goal'], self.hi_act_space, args, False,
                        env_params['goal'], args.gradient_flow_value, args.abs_range, tanh_output)
    self.env_params['real_goal_dim'] = self.real_goal_dim
    self.hi_buffer = ReplayMemory(args.buffer_size)

    # her sampler
    self.c = self.args.c  # interval of high-level actions
    self.low_her_module = her_sampler(args.replay_strategy, args.replay_k, args.distance,
                                      args.future_step, dense_reward=dense_low,
                                      direction_reward=False,
                                      low_reward_coeff=args.low_reward_coeff)
    if args.env_name[:5] == "Fetch":
        self.low_buffer = replay_buffer_energy(self.env_params, self.args.buffer_size,
                                               self.low_her_module.sample_her_energy, args.env_name)
    else:
        self.low_buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                        self.low_her_module.sample_her_transitions)

    not_load_buffer, not_load_high = True, False
    if self.resume:
        self.start_epoch = self.resume_epoch
        if not not_load_high:
            self.hi_agent.policy.load_state_dict(
                torch.load(self.args.resume_path + '/hi_actor_model.pt', map_location='cuda:4')[0])
            # self.hi_agent.critic.load_state_dict(
            #     torch.load(self.args.resume_path + '/hi_critic_model.pt', map_location='cuda:4')[0])
        # print("not load low !!!")
        print("load low !!!")
        self.low_actor_network.load_state_dict(
            torch.load(self.args.resume_path + '/low_actor_model.pt', map_location='cuda:4')[0])
        self.low_critic_network.load_state_dict(
            torch.load(self.args.resume_path + '/low_critic_model.pt', map_location='cuda:4')[0])
        if not not_load_buffer:
            # self.hi_buffer = torch.load(self.args.resume_path + '/hi_buffer.pt', map_location='cuda:1')
            self.low_buffer = torch.load(self.args.resume_path + '/low_buffer.pt', map_location='cuda:1')

    # sync target network of the low level
    self.sync_target()

    if hasattr(self.env.env, 'env'):
        self.animate = self.env.env.env.visualize_goal
    else:
        self.animate = self.args.animate
    self.distance_threshold = self.args.distance

    if not (args.gradient_flow or args.use_prediction or args.gradient_flow_value):
        self.representation = RepresentationNetwork(env_params, 3, self.abs_range,
                                                    self.real_goal_dim).to(args.device)
        if args.use_target:
            self.target_phi = RepresentationNetwork(env_params, 3, self.abs_range, 2).to(args.device)
            # load the weights into the target network
            self.target_phi.load_state_dict(self.representation.state_dict())
        self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
        if resume_phi:
            print("load phi from: ", phi_path)
            self.representation.load_state_dict(
                torch.load(phi_path + '/phi_model_4000.pt', map_location='cuda:4')[0])
    elif args.use_prediction:
        self.representation = DynamicsNetwork(env_params, self.abs_range, 2,
                                              tanh_output=tanh_output, use_prob=self.use_prob,
                                              device=args.device).to(args.device)
        self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
        if resume_phi:
            print("load phi from: ", phi_path)
            self.representation.load_state_dict(
                torch.load(phi_path + '/phi_model_4000.pt', map_location='cuda:1')[0])

    print("learn goal space", self.learn_goal_space, " update phi", not self.not_update_phi)
    self.train_success = 0
    self.furthest_task = 0.
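# NOTE: illustrative sketch, not part of the original file. The agent above learns a latent
# subgoal space: phi (self.representation) embeds raw observations into a box of size
# `abs_range`, the high-level SAC policy acts in that box every `self.c` steps, and the
# low-level policy is rewarded for moving phi(s) toward the chosen latent subgoal. The helper
# below only illustrates that reward shape; the negative-L2 form and the call signature
# `self.representation(obs_tensor)` are assumptions, not the repository's verified API.
def _latent_low_reward_sketch(self, obs, subgoal):
    with torch.no_grad():
        obs_tensor = torch.as_tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)
        # phi(s): embed the raw observation into the learned latent subgoal space
        latent_obs = self.representation(obs_tensor).squeeze(0).cpu().numpy()
    # dense low-level reward: negative distance to the subgoal in latent space
    return -np.linalg.norm(latent_obs - subgoal)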
def __init__(self, args, env, env_params, test_env, resume=False, resume_epoch_actor=0, resume_epoch_critic=0):
    self.args = args
    self.env = env
    self.test_env = test_env
    self.env_params = env_params
    self.device = args.device
    self.resume = resume
    self.resume_epoch_actor = resume_epoch_actor
    self.resume_epoch_critic = resume_epoch_critic

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    self.writer = SummaryWriter(log_dir='runs/ddpg' + current_time + '_' + str(args.env_name) +
                                str(args.lr_critic) + '_' + str(args.gamma) + '_' +
                                str(args.plan_rate) + '_' + str(args.fps))
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # path to save the model
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)

    self.actor_network = actor(env_params)
    self.plan_rate = args.plan_rate
    self.init_critics()
    self.actor_target_network = actor(env_params)
    if self.resume:
        self.actor_network.load_state_dict(
            torch.load(self.args.path + '/actor_model_' + str(self.resume_epoch_actor) + '.pt')[0])

    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())

    # if use gpu
    self.actor_network.to(self.device)
    self.critic_network.to(self.device)
    self.actor_target_network.to(self.device)
    self.critic_target_network.to(self.device)

    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
    if self.args.search:
        print('here true')
        self.critic_optim = torch.optim.Adam(
            [{'params': self.critic_network.base.parameters()},
             {'params': self.critic_network.gamma, 'lr': 5e-5}],
            lr=self.args.lr_critic)
    else:
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)

    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.args.distance)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions)
    if args.fps == 1:
        self.planner_policy = Planner(agent=self, framebuffer=self.buffer, fps=True,
                                      clip_v=args.clip_v, n_landmark=args.landmark,
                                      initial_sample=args.initial_sample)
    else:
        self.planner_policy = Planner(agent=self, framebuffer=self.buffer, fps=False,
                                      clip_v=args.clip_v, n_landmark=args.landmark,
                                      initial_sample=args.initial_sample)
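# NOTE: illustrative sketch, not part of the original file. With both `self.planner_policy`
# and `self.actor_network` available, `plan_rate` reads most naturally as the fraction of
# decisions delegated to the landmark planner; the helper below shows that interpretation.
# The call `self.planner_policy.plan(obs, goal)` and its arguments are hypothetical and not
# taken from the repository's Planner API.
def _act_sketch(self, obs, goal):
    if np.random.rand() < self.plan_rate:
        # hypothetical call: ask the planner for an intermediate goal to pursue instead
        goal = self.planner_policy.plan(obs, goal)
    obs_tensor = torch.as_tensor(np.concatenate([obs, goal]), dtype=torch.float32,
                                 device=self.device).unsqueeze(0)
    with torch.no_grad():
        # goal-conditioned DDPG actor produces the low-level action
        action = self.actor_network(obs_tensor)
    return action.squeeze(0).cpu().numpy()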