def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None, reward_fun=None):
    """Single-environment setup for a DDPG_BD/MADDPG_BD agent (robot or object side):
    builds the env, spaces, exploration noise, the model, and a pair of HER replay buffers."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                       observe_obj_grp=config['observe_obj_grp'])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        env = gym.make(ENV_NAME)

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    #if config['obj_action_type'] == 'all':
    #    n_actions = config['max_nb_objects'] * 7 + 4
    #elif config['obj_action_type'] == 'slide_only':
    #    n_actions = config['max_nb_objects'] * 3 + 4
    #elif config['obj_action_type'] == 'rotation_only':
    #    n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions

    # exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = (Noise(action_space[0].shape[0], sigma=0.2, eps=0.3),
                 Noise(action_space[1].shape[0], sigma=0.2, eps=0.3))
        env._max_episode_steps *= config['max_nb_objects']
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)
        noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS, agent_id=agent_id,
                  object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                  reward_fun=reward_fun, masked_with_r=config['masked_with_r'])
    normalizer = [Normalizer(), Normalizer()]

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (env._max_episode_steps, env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps, env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps, env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = (ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps, sample_her_transitions),
              ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps, sample_her_transitions))

    experiment_args = (env, memory, noise, config, normalizer, agent_id)
    print('singleseeding')

    return model, experiment_args
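
# Hedged sketch (illustrative only, not repo code): how the three-way action space above is
# sized. 'obj_action_type' is assumed to be a list of per-object action components; the
# entries used here are placeholders, and the real config may differ.
def _demo_action_space_split(max_nb_objects=2, n_rob_actions=4):
    import gym.spaces as spaces
    obj_action_type = [0, 1, 2]  # stand-in for a 3-component per-object action
    n_actions = max_nb_objects * len(obj_action_type) + n_rob_actions
    return (spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),            # robot only
            spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),  # objects only
            spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))                 # combined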
def init(config, agent='robot', her=False, reward_fun=None, obj_traj=None, obj_mean=None, obj_std=None):
    """Vectorized (SubprocVecEnv) setup for a multi-critic DDPG_BD robot agent that conditions on
    sub-goals from a pretrained object-trajectory model (obj_traj); warms up the observation
    normalizer with one rollout on a dummy env."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', stack_prob=None):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
            elif env_type == 'FetchStack':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'],
                               change_stack_order=config['change_stack_order'])
            elif env_type == 'Hand':
                env = gym.make(env_id, obj_action_type=config['obj_action_type'])
            elif env_type == 'Others':
                env = gym.make(env_id)

            #env._max_episode_steps *= config['max_nb_objects']

            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED + 10 * i_env)
            if stack_prob is not None:
                env.unwrapped.stack_prob = stack_prob
            return env
        return _f

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Stack' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['train_stack_prob']) for i_env in range(N_ENVS)])
        envs_test = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob']) for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob']) for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(1)])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRob' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Others') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = None

    def make_her_reward_fun(nb_critics, use_step_reward_fun=False):
        def _her_reward_fun(ag_2, g, info):  # vectorized
            goal_len = ag_2.shape[1] // nb_critics
            rew = dummy_env.compute_reward(achieved_goal=ag_2[:, 0:goal_len], desired_goal=g[:, 0:goal_len], info=info)
            all_rew = rew.copy()[:, np.newaxis]
            for i_reward in range(1, nb_critics):
                if not use_step_reward_fun:
                    obj_rew = dummy_env.compute_reward(achieved_goal=ag_2[:, goal_len*i_reward:goal_len*(i_reward+1)],
                                                       desired_goal=g[:, goal_len*i_reward:goal_len*(i_reward+1)],
                                                       info=info)
                else:
                    goal_a = ag_2[:, goal_len*i_reward:goal_len*(i_reward+1)].reshape(-1, dummy_env.env.n_objects, 3)
                    goal_b = g[:, goal_len*i_reward:goal_len*(i_reward+1)].reshape(-1, dummy_env.env.n_objects, 3)
                    d = np.linalg.norm(goal_a - goal_b, axis=-1)
                    #obj_rew = - (d > dummy_env.env.distance_threshold).astype(np.float32).sum(-1)
                    obj_rew = - (d > dummy_env.env.distance_threshold).astype(np.float32)
                all_rew = np.concatenate((all_rew, obj_rew.copy()[:, np.newaxis]), axis=-1)
                #all_rew = np.concatenate((all_rew, obj_rew.copy()), axis=-1)
            return all_rew
        return _her_reward_fun

    her_reward_fun = make_her_reward_fun(2, config['use_step_reward_fun'])

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = (dummy_env.observation_space.spaces['observation'].shape[1] * 2 +
                         dummy_env.observation_space.spaces['desired_goal'].shape[0] * 2,
                         dummy_env.observation_space.spaces['observation'].shape[1] * 1 +
                         dummy_env.observation_space.spaces['desired_goal'].shape[0] * 2)
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    MODEL = DDPG_BD

    # exploration initialization
    noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)

    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS, reward_fun=reward_fun, clip_Q_neg=clip_Q_neg,
                  nb_critics=config['max_nb_objects'] + 1)
    model.n_objects = config['max_nb_objects']

    class NormalizerObj(object):
        def __init__(self, mean, std):
            self.mean = mean
            self.std = std

        def process(self, achieved, desired):
            achieved_out = achieved - K.tensor(self.mean[0], dtype=achieved.dtype, device=achieved.device)
            achieved_out /= K.tensor(self.std[0], dtype=achieved.dtype, device=achieved.device)
            desired_out = desired - K.tensor(self.mean[1], dtype=desired.dtype, device=desired.device)
            desired_out /= K.tensor(self.std[1], dtype=desired.dtype, device=desired.device)
            return achieved_out, desired_out

    normalizer = [Normalizer(), Normalizer(), NormalizerObj(obj_mean, obj_std)]

    model.obj_traj = obj_traj.to('cpu')
    model.obj_traj.eval()

    for _ in range(1):
        state_all = dummy_env.reset()
        for i_step in range(config['episode_length']):
            model.to_cpu()

            obs = [K.tensor(obs, dtype=K.float32).unsqueeze(0) for obs in state_all['observation']]
            goal = K.tensor(state_all['desired_goal'], dtype=K.float32).unsqueeze(0)

            if i_step % config['objtraj_goal_horizon'] == 0:
                achieved_goal = K.tensor(state_all['achieved_goal'], dtype=K.float32).unsqueeze(0)
                objtraj_goal = []
                goal_len_per_obj = goal.shape[1] // model.n_objects
                for i_object in range(model.n_objects):
                    achieved_goal_per_obj = achieved_goal[:, i_object*goal_len_per_obj:(i_object+1)*goal_len_per_obj]
                    goal_per_obj = goal[:, i_object*goal_len_per_obj:(i_object+1)*goal_len_per_obj]
                    normed_achieved_goal_per_obj, normed_goal_per_obj = normalizer[2].process(achieved_goal_per_obj, goal_per_obj)
                    with K.no_grad():
                        objtraj_goal_per_obj = model.obj_traj(normed_achieved_goal_per_obj, normed_goal_per_obj)
                    objtraj_goal.append(objtraj_goal_per_obj)
                objtraj_goal = K.cat(objtraj_goal, dim=-1)

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[0], obs[1], goal, objtraj_goal], dim=-1))
            if normalizer[0] is not None:
                obs_goal[0] = normalizer[0].preprocess_with_update(obs_goal[0])

            if config['agent_alg'] == 'DDPG_BD':
                action = model.select_action(obs_goal[0], noise).cpu().numpy().squeeze(0)
            elif config['agent_alg'] == 'MADDPG_BD':
                action = model.select_action(obs_goal[0], noise, goal_size=goal.shape[1]).cpu().numpy().squeeze(0)

            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            action_to_env[0:action.shape[0]] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (config['episode_length'], dummy_env.observation_space.spaces['observation'].shape[1] * 3),
        'ag': (config['episode_length'], dummy_env.observation_space.spaces['achieved_goal'].shape[0] * 2),
        'g': (config['episode_length'], dummy_env.observation_space.spaces['desired_goal'].shape[0] * 2),
        'u': (config['episode_length'] - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'], sample_her_transitions)

    experiment_args = ((envs, envs_test, envs_render), memory, noise, config, normalizer, None)

    return model, experiment_args
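
# Hedged sketch (illustrative only, not repo code): how the multi-critic HER reward above
# slices the concatenated achieved/desired goal vectors into one reward column per critic.
# _toy_compute_reward is a stand-in for env.compute_reward with a sparse threshold reward.
def _demo_multi_critic_reward(nb_critics=2, goal_len=3, batch=4, threshold=0.05):
    import numpy as np

    def _toy_compute_reward(achieved_goal, desired_goal, info=None):
        d = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
        return -(d > threshold).astype(np.float32)

    ag_2 = np.random.randn(batch, nb_critics * goal_len)  # concatenated achieved goals
    g = np.random.randn(batch, nb_critics * goal_len)     # concatenated desired goals

    all_rew = _toy_compute_reward(ag_2[:, :goal_len], g[:, :goal_len])[:, np.newaxis]
    for i_reward in range(1, nb_critics):
        chunk = slice(goal_len * i_reward, goal_len * (i_reward + 1))
        obj_rew = _toy_compute_reward(ag_2[:, chunk], g[:, chunk])
        all_rew = np.concatenate((all_rew, obj_rew[:, np.newaxis]), axis=-1)
    return all_rew  # shape (batch, nb_critics): one reward column per critic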
def init(config):
    """Plain MADDPG/DDPG(-R/-RAE) setup with optional checkpoint resume, train/test summaries
    (Summarizer), checkpointing (Saver), and a single HER replay buffer."""
    if config['resume'] != '':
        resume_path = config['resume']
        saver = Saver(config)
        config, start_episode, save_dict = saver.resume_ckpt()
        config['resume'] = resume_path
    else:
        start_episode = 0

    # hyperparameters
    ENV_NAME = config['env_id']  # 'simple_spread'
    SEED = config['random_seed']  # 1

    GAMMA = config['gamma']  # 0.95
    TAU = config['tau']  # 0.01
    ACTOR_LR = config['plcy_lr']  # 0.01
    CRITIC_LR = config['crtc_lr']  # 0.01

    MEM_SIZE = config['buffer_length']  # 1000000

    REGULARIZATION = config['regularization']  # True
    NORMALIZED_REWARDS = config['reward_normalization']  # True

    if (ENV_NAME == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1'):
        env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'])
    else:
        env = gym.make(ENV_NAME)
    env.seed(SEED)

    if config['obj_action_type'] == 'all':
        n_actions = config['max_nb_objects'] * 7 + 4
    elif config['obj_action_type'] == 'slide_only':
        n_actions = config['max_nb_objects'] * 3 + 4
    elif config['obj_action_type'] == 'rotation_only':
        n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - 4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    if env.action_space.low[0] == -1 and env.action_space.high[0] == 1:
        OUT_FUNC = K.tanh
    elif env.action_space.low[0] == 0 and env.action_space.high[0] == 1:
        OUT_FUNC = K.sigmoid
    else:
        OUT_FUNC = K.sigmoid

    K.manual_seed(SEED)
    np.random.seed(SEED)

    if config['agent_alg'] == 'MADDPG':
        MODEL = MADDPG
    elif config['agent_alg'] == 'DDPG':
        MODEL = DDPG
    elif config['agent_alg'] == 'MADDPG_R':
        MODEL = MADDPG_R
    elif config['agent_alg'] == 'MADDPG_RAE':
        MODEL = MADDPG_RAE

    if config['verbose'] > 1:
        # utils
        summaries = (Summarizer(config['dir_summary_train'], config['port'], config['resume']),
                     Summarizer(config['dir_summary_test'], config['port'], config['resume']))
        saver = Saver(config)
    else:
        summaries = None
        saver = None

    # exploration initialization
    noise = (Noise(action_space[0].shape[0], sigma=0.2, eps=0.3),
             Noise(action_space[1].shape[0], sigma=0.05, eps=0.1))
    #noise = OUNoise(action_space.shape[0])

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss
    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS)

    if config['resume'] != '':
        for i, param in enumerate(save_dict['model_params']):
            model.entities[i].load_state_dict(param)

    # memory initialization
    #memory = ReplayMemory(MEM_SIZE)

    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    sample_her_transitions = make_sample_her_transitions('future', 4, reward_fun)

    buffer_shapes = {
        'o': (env._max_episode_steps, env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps, env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps, env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps, sample_her_transitions)

    normalizer = (Normalizer(), Normalizer())

    experiment_args = (env, memory, noise, config, summaries, saver, start_episode, normalizer)

    return model, experiment_args
def init(config, agent='robot', her=False, reward_fun=None, obj_traj=None, obj_mean=None, obj_std=None):
    """Vectorized setup for the Q-schedule DDPG_BD/MADDPG_BD robot agent (one critic per object)
    on the Fetch MaRob/MaRobSeq tasks; a pretrained obj_traj model (on GPU) provides sub-goals and
    the observation normalizer is warmed up with one dummy-env rollout."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', stack_prob=None):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
            elif env_type == 'FetchStack':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'],
                               change_stack_order=config['change_stack_order'])
            elif env_type == 'Hand':
                env = gym.make(env_id, obj_action_type=config['obj_action_type'])
            elif env_type == 'FetchMaRobSeq':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=np.array([0.15, 0.60]),
                               widerangeobj=True)
            elif env_type == 'FetchMaRobSeqTest':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'],
                               widerangeobj=False)
            elif env_type == 'Others':
                env = gym.make(env_id)

            #env._max_episode_steps *= config['max_nb_objects']

            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED + 10 * i_env)
            if stack_prob is not None:
                env.unwrapped.stack_prob = stack_prob
            return env
        return _f

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Stack' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['train_stack_prob']) for i_env in range(N_ENVS)])
        envs_test = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob']) for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchStack', config['test_stack_prob']) for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand') for i_env in range(1)])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRobLong' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch') for i_env in range(1)])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'MaRobSeq' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchMaRobSeq') for i_env in range(N_ENVS)])
        envs_test = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchMaRobSeqTest') for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'FetchMaRobSeqTest') for i_env in range(1)])
        n_rob_actions = 4 * 2
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Others') for i_env in range(N_ENVS)])
        envs_test = None
        envs_render = None

    def her_reward_fun(ag_2, g, info):  # vectorized
        return dummy_env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info).reshape(-1, 1)

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = (dummy_env.observation_space.spaces['observation'].shape[1] * 2 +
                         dummy_env.observation_space.spaces['desired_goal'].shape[0],
                         dummy_env.observation_space.spaces['observation'].shape[1] +
                         dummy_env.observation_space.spaces['desired_goal'].shape[0])
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        from olbr.algorithms.ddpg_q_schedule import DDPG_BD
    elif config['agent_alg'] == 'MADDPG_BD':
        from olbr.algorithms.maddpg_q_schedule import DDPG_BD
    MODEL = DDPG_BD

    # exploration initialization
    noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)

    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS, reward_fun=reward_fun, clip_Q_neg=clip_Q_neg,
                  nb_critics=config['max_nb_objects']  # or fixing to 3
                  )
    model.n_objects = config['max_nb_objects']

    class NormalizerObj(object):
        def __init__(self, mean, std):
            self.mean = mean
            self.std = std

        def process(self, achieved, desired, step):
            achieved_out = achieved - K.tensor(self.mean[0], dtype=achieved.dtype, device=achieved.device)
            achieved_out /= K.tensor(self.std[0], dtype=achieved.dtype, device=achieved.device)
            desired_out = desired - K.tensor(self.mean[1], dtype=desired.dtype, device=desired.device)
            desired_out /= K.tensor(self.std[1], dtype=desired.dtype, device=desired.device)
            step_out = step - K.tensor(self.mean[2], dtype=desired.dtype, device=desired.device)
            step_out /= K.tensor(self.std[2], dtype=desired.dtype, device=desired.device)
            return achieved_out, desired_out, step_out

    normalizer = [Normalizer(), Normalizer(), NormalizerObj(obj_mean, obj_std)]

    model.obj_traj = obj_traj.to('cuda')
    model.obj_traj.eval()

    for _ in range(1):
        state_all = dummy_env.reset()
        for i_step in range(config['episode_length']):
            model.to_cpu()

            obs = [K.tensor(obs, dtype=K.float32).unsqueeze(0) for obs in state_all['observation']]
            goal = K.tensor(state_all['desired_goal'], dtype=K.float32).unsqueeze(0)
            if i_step == 0:
                objtraj_goal = goal

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[0], obs[1], objtraj_goal], dim=-1))
            if normalizer[0] is not None:
                obs_goal[0] = normalizer[0].preprocess_with_update(obs_goal[0])

            if config['agent_alg'] == 'DDPG_BD':
                action = model.select_action(obs_goal[0], noise).cpu().numpy().squeeze(0)
            elif config['agent_alg'] == 'MADDPG_BD':
                action = model.select_action(obs_goal[0], noise, goal_size=goal.shape[1]).cpu().numpy().squeeze(0)

            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            action_to_env[0:action.shape[0]] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (config['episode_length'], dummy_env.observation_space.spaces['observation'].shape[1] * 3),
        'ag': (config['episode_length'], dummy_env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (config['episode_length'], dummy_env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (config['episode_length'] - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'], sample_her_transitions)

    experiment_args = ((envs, envs_test, envs_render), memory, noise, config, normalizer, None)
    print("0.20 - 0.25 - boundary_sample iff original_goal update norm")

    return model, experiment_args
def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None, reward_fun=None):
    """Vectorized setup for the robot/object DDPG_BD (or MADDPG_BD) agent with optional pretrained
    object Q-function, backward dynamics model, and object policy; warms up the per-agent
    normalizer with one dummy-env rollout."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    def make_env(env_id, i_env, env_type='Fetch', ai_object=False):
        def _f():
            if env_type == 'Fetch':
                env = gym.make(env_id, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                               observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
            elif env_type == 'Hand':
                env = gym.make(env_id, obj_action_type=config['obj_action_type'])
            elif env_type == 'Others':
                env = gym.make(env_id)

            keys = env.observation_space.spaces.keys()
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))
            env.seed(SEED + 10 * i_env)
            env.unwrapped.ai_object = ai_object
            return env
        return _f

    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Flex' not in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch', agent == 'object') for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch', agent == 'object') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME and 'Flex' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                             observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch', agent == 'object') for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Fetch', agent == 'object') for i_env in range(1)])
        n_rob_actions = 4
        n_actions = 2 * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        dummy_env = gym.make(ENV_NAME, obj_action_type=config['obj_action_type'])
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand', agent == 'object') for i_env in range(N_ENVS)])
        envs_render = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Hand', agent == 'object') for i_env in range(1)])
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        dummy_env = gym.make(ENV_NAME)
        envs = SubprocVecEnv([make_env(ENV_NAME, i_env, 'Others', agent == 'object') for i_env in range(N_ENVS)])
        envs_render = None

    def her_reward_fun(ag_2, g, info):  # vectorized
        return dummy_env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = dummy_env.observation_space.spaces['observation'].shape[1] + dummy_env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    clip_Q_neg = config['clip_Q_neg'] if config['clip_Q_neg'] < 0 else None
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions

    # exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)
        noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.2)

    config['episode_length'] = dummy_env._max_episode_steps
    config['observation_space'] = dummy_env.observation_space

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS, agent_id=agent_id,
                  object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                  reward_fun=reward_fun, clip_Q_neg=clip_Q_neg,
                  goal_space=dummy_env.reset()['desired_goal'].shape[0])
    normalizer = [Normalizer(), Normalizer()]

    for _ in range(1):
        state_all = dummy_env.reset()
        for _ in range(config['episode_length']):
            model.to_cpu()

            obs = [K.tensor(obs, dtype=K.float32).unsqueeze(0) for obs in state_all['observation']]
            goal = K.tensor(state_all['desired_goal'], dtype=K.float32).unsqueeze(0)

            # Observation normalization
            obs_goal = []
            obs_goal.append(K.cat([obs[agent_id], goal], dim=-1))
            if normalizer[agent_id] is not None:
                obs_goal[0] = normalizer[agent_id].preprocess_with_update(obs_goal[0])

            action = model.select_action(obs_goal[0], noise).cpu().numpy().squeeze(0)

            action_to_env = np.zeros_like(dummy_env.action_space.sample())
            if agent_id == 0:
                action_to_env[0:action.shape[0]] = action
            else:
                action_to_env[-action.shape[0]::] = action

            next_state_all, _, _, _ = dummy_env.step(action_to_env)

            # Move to the next state
            state_all = next_state_all

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (config['episode_length'], dummy_env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (config['episode_length'], dummy_env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (config['episode_length'], dummy_env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (config['episode_length'] - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, config['episode_length'], sample_her_transitions)

    experiment_args = ((envs, envs_render), memory, noise, config, normalizer, agent_id)

    return model, experiment_args
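
# Hedged sketch (illustrative only, not repo code): how the warm-up rollout above writes the
# selected action into the combined env action vector -- the robot agent (agent_id == 0) fills
# the head, the object agent (agent_id == 1) fills the tail. Sizes here are made up.
def _demo_action_placement(agent_id=0, n_rob_actions=4, n_obj_actions=3):
    import numpy as np
    action = np.ones(n_rob_actions if agent_id == 0 else n_obj_actions)
    action_to_env = np.zeros(n_rob_actions + n_obj_actions)
    if agent_id == 0:
        action_to_env[0:action.shape[0]] = action   # robot actions occupy the head
    else:
        action_to_env[-action.shape[0]:] = action   # object actions occupy the tail
    return action_to_env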
def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None, reward_fun=None):
    """Single-environment setup supporting DDPG_BD, MADDPG_BD, or PPO_BD for the robot/object agent;
    PPO uses a stochastic actor with rollout storage instead of a HER replay buffer."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if (ENV_NAME == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1') or (ENV_NAME == 'FetchPickAndPlaceMulti-v1'):
        env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                       observe_obj_grp=config['observe_obj_grp'])
    else:
        env = gym.make(ENV_NAME)

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    #if config['obj_action_type'] == 'all':
    #    n_actions = config['max_nb_objects'] * 7 + 4
    #elif config['obj_action_type'] == 'slide_only':
    #    n_actions = config['max_nb_objects'] * 3 + 4
    #elif config['obj_action_type'] == 'rotation_only':
    #    n_actions = config['max_nb_objects'] * 4 + 4
    n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + 4

    observation_space = env.observation_space.spaces['observation'].shape[1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - 4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    if config['agent_alg'] == 'DDPG_BD':
        MODEL = DDPG_BD
        OUT_FUNC = K.tanh
        from olbr.agents.basic import Actor
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
    elif config['agent_alg'] == 'MADDPG_BD':
        MODEL = MADDPG_BD
        OUT_FUNC = K.tanh
        from olbr.agents.basic import Actor
        from olbr.replay_buffer import ReplayBuffer_v2 as ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions_v2 as make_sample_her_transitions
    elif config['agent_alg'] == 'PPO_BD':
        MODEL = PPO_BD
        OUT_FUNC = 'linear'
        from olbr.agents.basic import ActorStoch as Actor
        from olbr.replay_buffer import RolloutStorage as ReplayBuffer

    # exploration initialization
    if agent == 'robot':
        agent_id = 0
        if config['agent_alg'] == 'PPO_BD':
            noise = True
        else:
            noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        #noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)
        noise = Noise(action_space[1].shape[0], sigma=0.2, eps=0.3)

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    if config['agent_alg'] == 'PPO_BD':
        model = MODEL(observation_space, action_space, optimizer, Actor, Critic,
                      config['clip_param'], config['ppo_epoch'], config['n_batches'],
                      config['value_loss_coef'], config['entropy_coef'], eps=config['eps'],
                      max_grad_norm=config['max_grad_norm'], use_clipped_value_loss=True,
                      out_func=OUT_FUNC, discrete=False, agent_id=agent_id,
                      object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                      reward_fun=reward_fun, masked_with_r=config['masked_with_r'])
    else:
        model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                      out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                      normalized_rewards=NORMALIZED_REWARDS, agent_id=agent_id,
                      object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                      reward_fun=reward_fun, masked_with_r=config['masked_with_r'])
    normalizer = [Normalizer(), Normalizer()]

    # memory initialization
    if config['agent_alg'] == 'PPO_BD':
        memory = ReplayBuffer(env._max_episode_steps - 1, config['n_rollouts'], (observation_space,), action_space[0])
    else:
        if her:
            sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
        else:
            sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

        buffer_shapes = {
            'o': (env._max_episode_steps, env.observation_space.spaces['observation'].shape[1] * 2),
            'ag': (env._max_episode_steps, env.observation_space.spaces['achieved_goal'].shape[0]),
            'g': (env._max_episode_steps, env.observation_space.spaces['desired_goal'].shape[0]),
            'u': (env._max_episode_steps - 1, action_space[2].shape[0]),
            'r': (env._max_episode_steps - 1, 1)
        }
        memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps, sample_her_transitions)

    experiment_args = (env, memory, noise, config, normalizer, agent_id)
    print('clipped between -1 and 0, and masked with abs(r), and + r')

    return model, experiment_args
def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None):
    """Minimal single-environment DDPG_BD setup (older variant without a custom reward_fun)."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']

    if (ENV_NAME == 'FetchStackMulti-v1') or (ENV_NAME == 'FetchPushMulti-v1'):
        env = gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'])
    else:
        env = gym.make(ENV_NAME)

    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    env.seed(SEED)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    if config['obj_action_type'] == 'all':
        n_actions = config['max_nb_objects'] * 7 + 4
    elif config['obj_action_type'] == 'slide_only':
        n_actions = config['max_nb_objects'] * 3 + 4
    elif config['obj_action_type'] == 'rotation_only':
        n_actions = config['max_nb_objects'] * 4 + 4

    observation_space = env.observation_space.spaces['observation'].shape[1] + env.observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - 4,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    OUT_FUNC = K.tanh
    MODEL = DDPG_BD

    # exploration initialization
    if agent == 'robot':
        agent_id = 0
        noise = Noise(action_space[0].shape[0], sigma=0.2, eps=0.3)
    elif agent == 'object':
        agent_id = 1
        noise = Noise(action_space[1].shape[0], sigma=0.05, eps=0.1)

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic, loss_func, GAMMA, TAU,
                  out_func=OUT_FUNC, discrete=False, regularization=REGULARIZATION,
                  normalized_rewards=NORMALIZED_REWARDS, agent_id=agent_id,
                  object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy)
    normalizer = [Normalizer(), Normalizer()]

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, reward_fun)

    buffer_shapes = {
        'o': (env._max_episode_steps, env.observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env._max_episode_steps, env.observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env._max_episode_steps, env.observation_space.spaces['desired_goal'].shape[0]),
        'u': (env._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = ReplayBuffer(buffer_shapes, MEM_SIZE, env._max_episode_steps, sample_her_transitions)

    experiment_args = (env, memory, noise, config, normalizer, agent_id)

    return model, experiment_args
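
# Hedged sketch (assumption, not the repo's Noise class): the (sigma, eps) arguments used above
# suggest the usual DDPG+HER exploration scheme -- additive Gaussian noise plus epsilon-greedy
# resampling of a uniform random action, clipped to the [-1, 1] action range.
def _demo_exploration_noise(action, sigma=0.2, eps=0.3):
    import numpy as np
    noisy = action + sigma * np.random.randn(*action.shape)  # Gaussian action noise
    if np.random.rand() < eps:
        noisy = np.random.uniform(-1., 1., size=action.shape)  # occasional fully random action
    return np.clip(noisy, -1., 1.)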
def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None, reward_fun=None, rnd_models=None):
    """Setup for a PPO_BD robot agent with RND intrinsic-reward models, using a plain list of
    seeded environments and rollout storage."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    env = []
    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                                observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range']))
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME, obj_action_type=config['obj_action_type']))
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME))

    for i_env in range(N_ENVS):
        env[i_env].seed(SEED + 10 * i_env)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = env[0].observation_space.spaces['observation'].shape[1] + env[0].observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    MODEL = PPO_BD
    OUT_FUNC = 'linear'

    # exploration initialization
    noise = True
    env[0]._max_episode_steps *= config['max_nb_objects']

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic,
                  config['clip_param'], config['ppo_epoch'], config['n_batches'],
                  config['value_loss_coef'], config['entropy_coef'], eps=config['eps'],
                  max_grad_norm=config['max_grad_norm'], use_clipped_value_loss=True,
                  out_func=OUT_FUNC, discrete=False,
                  object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                  reward_fun=reward_fun, masked_with_r=config['masked_with_r'],
                  rnd_models=rnd_models, pred_th=config['pred_th'])
    normalizer = [Normalizer(), Normalizer()]

    # memory initialization
    memory = ReplayBuffer(env[0]._max_episode_steps - 1, config['n_rollouts'], (observation_space,), action_space[0])

    running_rintr_mean = RunningMean()

    experiment_args = (env, memory, noise, config, normalizer, running_rintr_mean)

    return model, experiment_args
def init(config, agent='robot', her=False, object_Qfunc=None, backward_dyn=None, object_policy=None, reward_fun=None):
    """Setup for a PPO_BD robot agent paired with Gaussian-noise object exploration; memory is a
    (RolloutStorage, HER ReplayBuffer) pair."""
    # hyperparameters
    ENV_NAME = config['env_id']
    SEED = config['random_seed']
    N_ENVS = config['n_envs']

    env = []
    if 'Fetch' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME, n_objects=config['max_nb_objects'], obj_action_type=config['obj_action_type'],
                                observe_obj_grp=config['observe_obj_grp'], obj_range=config['obj_range']))
        n_rob_actions = 4
        n_actions = config['max_nb_objects'] * len(config['obj_action_type']) + n_rob_actions
    elif 'HandManipulate' in ENV_NAME and 'Multi' in ENV_NAME:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME, obj_action_type=config['obj_action_type']))
        n_rob_actions = 20
        n_actions = 1 * len(config['obj_action_type']) + n_rob_actions
    else:
        for i_env in range(N_ENVS):
            env.append(gym.make(ENV_NAME))

    def her_reward_fun(ag_2, g, info):  # vectorized
        return env[0].compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    for i_env in range(N_ENVS):
        env[i_env].seed(SEED + 10 * i_env)
    K.manual_seed(SEED)
    np.random.seed(SEED)

    observation_space = env[0].observation_space.spaces['observation'].shape[1] + env[0].observation_space.spaces['desired_goal'].shape[0]
    action_space = (gym.spaces.Box(-1., 1., shape=(n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions - n_rob_actions,), dtype='float32'),
                    gym.spaces.Box(-1., 1., shape=(n_actions,), dtype='float32'))

    GAMMA = config['gamma']
    TAU = config['tau']
    ACTOR_LR = config['plcy_lr']
    CRITIC_LR = config['crtc_lr']
    MEM_SIZE = config['buffer_length']
    REGULARIZATION = config['regularization']
    NORMALIZED_REWARDS = config['reward_normalization']

    if config['agent_alg'] == 'PPO_BD':
        MODEL = PPO_BD
        OUT_FUNC = 'linear'
        from olbr.replay_buffer import ReplayBuffer
        from olbr.her_sampler import make_sample_her_transitions
        from olbr.replay_buffer import RolloutStorage as RolloutStorage

    # exploration initialization
    env[0]._max_episode_steps *= config['max_nb_objects']
    noise = (True, Noise(action_space[1].shape[0], sigma=0.2, eps=0.3))

    # model initialization
    optimizer = (optim.Adam, (ACTOR_LR, CRITIC_LR))  # optimiser func, (actor_lr, critic_lr)
    loss_func = F.mse_loss

    model = MODEL(observation_space, action_space, optimizer, Actor, Critic,
                  config['clip_param'], config['ppo_epoch'], config['n_batches'],
                  config['value_loss_coef'], config['entropy_coef'], eps=config['eps'],
                  max_grad_norm=config['max_grad_norm'], use_clipped_value_loss=True,
                  out_func=OUT_FUNC, discrete=False, agent_id=0,
                  object_Qfunc=object_Qfunc, backward_dyn=backward_dyn, object_policy=object_policy,
                  reward_fun=reward_fun, masked_with_r=config['masked_with_r'])
    normalizer = [Normalizer(), Normalizer()]

    # memory initialization
    if her:
        sample_her_transitions = make_sample_her_transitions('future', 4, her_reward_fun)
    else:
        sample_her_transitions = make_sample_her_transitions('none', 4, her_reward_fun)

    buffer_shapes = {
        'o': (env[0]._max_episode_steps, env[0].observation_space.spaces['observation'].shape[1] * 2),
        'ag': (env[0]._max_episode_steps, env[0].observation_space.spaces['achieved_goal'].shape[0]),
        'g': (env[0]._max_episode_steps, env[0].observation_space.spaces['desired_goal'].shape[0]),
        'u': (env[0]._max_episode_steps - 1, action_space[2].shape[0])
    }
    memory = (RolloutStorage(env[0]._max_episode_steps - 1, config['n_rollouts'], (observation_space,), action_space[0]),
              ReplayBuffer(buffer_shapes, MEM_SIZE, env[0]._max_episode_steps, sample_her_transitions))

    experiment_args = (env, memory, noise, config, normalizer, 0)

    return model, experiment_args
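
# Hedged sketch (illustrative only, not repo code): the episode-storage convention implied by the
# buffer_shapes dicts above -- per-step quantities ('o', 'ag', 'g') keep all T entries of an
# episode, while actions ('u') have one entry fewer since no action follows the final state.
# Sizes here are made up.
def _demo_buffer_shapes_convention(T=50, obs_dim=10, goal_dim=3, act_dim=4):
    import numpy as np
    episode = {
        'o': np.zeros((T, obs_dim)),      # observations s_0 ... s_{T-1}
        'ag': np.zeros((T, goal_dim)),    # achieved goal at every step
        'g': np.zeros((T, goal_dim)),     # desired goal at every step
        'u': np.zeros((T - 1, act_dim)),  # actions a_0 ... a_{T-2}
    }
    return {key: value.shape for key, value in episode.items()}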