def worker(id, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps,
           update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''the function for sampling with multi-processing'''
    with torch.cuda.device(id % torch.cuda.device_count()):
        sac_trainer.to_cuda()
    # the sac_trainer objects are not the same, but all networks and optimizers in them are shared;
    # the replay buffer is the same one.
    print(sac_trainer, replay_buffer)

    env = L2RunEnv(visualize=False)
    state_dim = 43
    action_dim = 18
    action_range = 1.

    frame_idx = 0
    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            for _ in range(action_itr):
                try:
                    next_state, reward, done, _ = env.step(action)
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

            if replay_buffer.get_length() > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size,
                                           reward_scale=10.,
                                           auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                sac_trainer.save_model(model_path)

            if done:
                break

        print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
        rewards_queue.put(episode_reward)

    sac_trainer.save_model(model_path)
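# Hedged usage sketch (not part of the original file): one plausible way to launch the
# worker above with torch.multiprocessing. `sac_trainer`, `replay_buffer`, and the
# hyperparameter values are assumed to be constructed elsewhere in the training script;
# the worker count and the numbers below are illustrative only.
import torch.multiprocessing as mp

def launch_workers(sac_trainer, replay_buffer, model_path, num_workers=4):
    mp.set_start_method('spawn', force=True)  # 'spawn' is required when child processes touch CUDA
    rewards_queue = mp.Queue()                # workers push per-episode rewards here
    processes = []
    for i in range(num_workers):
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, rewards_queue, replay_buffer,
                             1000,    # max_episodes (illustrative)
                             1000,    # max_steps (illustrative)
                             256,     # batch_size
                             500,     # explore_steps
                             1,       # update_itr
                             1,       # action_itr
                             True,    # AUTO_ENTROPY
                             False,   # DETERMINISTIC
                             512,     # hidden_dim
                             model_path))
        p.daemon = True
        p.start()
        processes.append(p)
    return processes, rewards_queue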
def __init__(self, visualize, integrator_accuracy, full=False, action_repeat=5,
             fail_reward=-0.2, exclude_centering_frame=False):
    """
    Initialize the environment.

    Parameters:
    - full: use the full observation vector as the observation
    - action_repeat: how many frames to skip for every action
    - exclude_centering_frame: whether to keep the pelvis x in the obs vector
      (observations are centered with respect to the pelvis x)
    """
    env = L2RunEnv(visualize=visualize)
    env.osim_model.set_integrator_accuracy(integrator_accuracy)
    gym.Wrapper.__init__(self, env)
    env.reset()
    self.integrator_accuracy = integrator_accuracy
    self.visualize = visualize
    self.full = full
    self.env = env
    self.action_repeat = action_repeat
    self.fail_reward = fail_reward
    self.exclude_centering_frame = exclude_centering_frame
    self.env_step = 0
    if self.full:
        self.get_observation = self.get_observation_full
    else:
        self.get_observation = self.get_observation_basic
    self.observation_space = ([0] * self.get_observation_space_size(),
                              [0] * self.get_observation_space_size())
    self.observation_space = convert_to_gym(self.observation_space)
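# Hedged usage sketch: the __init__ above belongs to a gym.Wrapper subclass whose name is
# not shown in this excerpt; `L2RunEnvWrapper` below is a hypothetical stand-in for it and
# the argument values are illustrative.
# wrapped = L2RunEnvWrapper(visualize=False, integrator_accuracy=5e-5,
#                           full=True, action_repeat=3,
#                           fail_reward=-0.2, exclude_centering_frame=False)
# obs = wrapped.reset()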
def test_activations_changes(self):
    env = L2RunEnv(visualize=False)

    # First run: set initial activations to 0.9
    newAct = [0.9] * 18
    observation = env.reset()
    env.osim_model.set_activations(newAct)
    for i in range(5):
        withoutAct = env.osim_model.get_activations()
        observation, reward, done, info = env.step([0.5] * 18)

    # Second run: set initial activations to 0.1
    newAct = [0.1] * 18
    observation = env.reset()
    env.osim_model.set_activations(newAct)
    for i in range(5):
        withAct = env.osim_model.get_activations()
        observation, reward, done, info = env.step([0.5] * 18)

    dist = np.linalg.norm(np.array(withAct) - np.array(withoutAct))
    self.assertFalse(
        dist < 1e-2,
        "Activations after 5 steps haven't changed (despite different initial conditions)")
def test_actions(self):
    env = L2RunEnv(visualize=False)
    env.reset()
    v = env.action_space.sample()
    v[0] = 1.5
    v[1] = -0.5
    observation, reward, done, info = env.step(v)
def test_reset(self):
    env = L2RunEnv(visualize=False)
    for i in range(10):
        observation = env.reset()
    action = env.action_space.sample()
    action[5] = np.NaN
    self.assertRaises(ValueError, env.step, action)
def test_clipping(self):
    env = L2RunEnv(visualize=False)
    observation = env.reset()

    env.step(np.array([5.0] * 18))
    self.assertLessEqual(np.sum(env.osim_model.last_action), 18.1)

    env.step(np.array([-1.0] * 18))
    self.assertGreaterEqual(np.sum(env.osim_model.last_action), -0.1)
def env(chrom):
    from osim.env import L2RunEnv as RunEnv
    e = RunEnv(visualize=False)
    e.reset()
    T = 2
    total_reward = 0
    for t in range(500):
        obs, reward, done, _ = e.step(controller.input(chrom.allele, T, t * 0.01))
        total_reward += reward
        if done:
            break
    # print("HEADLESS: The reward is {}".format(total_reward))
    # clamp negative returns to zero so accumulated fitness stays well defined
    if total_reward < 0:
        total_reward = 0
    del e
    return total_reward
def __init__(self, reward_scale=1., frame_skip=1, visualize=False, reinit_random_action_every=1):
    self.reward_scale = reward_scale
    self.frame_skip = frame_skip
    self.vis = visualize
    self.reinit_random_action_every = reinit_random_action_every
    self.env = L2RunEnv(visualize=visualize)
    self.observation_shapes = [(43,)]
    self.action_size = 18
def test_activations(self):
    env = L2RunEnv(visualize=False)
    observation = env.reset()

    newact = np.array([0.0] * 18)
    env.osim_model.set_activations(newact)
    current = np.array(env.osim_model.get_activations())
    dist = np.linalg.norm(newact - current)
    self.assertTrue(dist < 0.05)

    newact = np.array([1.0] * 18)
    env.osim_model.set_activations(newact)
    current = np.array(env.osim_model.get_activations())
    dist = np.linalg.norm(newact - current)
    self.assertTrue(dist < 0.05)
def _thunk():
    info_keywords = ()
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    elif env_id.startswith("osim"):
        info_keywords = ('rb',)
        # https://github.com/stanfordnmbl/osim-rl
        _, task = env_id.split('.')
        if task == "Prosthetics":
            env = MyProstheticsEnv(integrator_accuracy=1e-4, **kwargs)
        elif task == "Arm2D":
            env = Arm2DEnv(integrator_accuracy=1e-4, **kwargs)
        else:  # task == "L2Run"
            assert task == "L2Run"
            env = L2RunEnv(integrator_accuracy=1e-4, **kwargs)
    else:
        env = gym.make(env_id)

    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)

    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    if log_dir is not None:
        env = Monitor(env,
                      os.path.join(log_dir, str(rank)),
                      info_keywords=info_keywords,
                      allow_early_resets=allow_early_resets)

    if is_atari:
        env = wrap_deepmind(env)

    # If the input has shape (W, H, 3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)

    return env
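# Hedged sketch (not from the original file): _thunk above is normally returned by an
# outer factory (e.g. a make_env(env_id, seed, rank, ...) closure) and a list of such
# thunks is handed to a vectorized-environment wrapper. The factory name and arguments
# below are assumptions based on the common baselines-style setup.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

def make_vec_envs(env_id, seed, num_processes, log_dir, add_timestep, allow_early_resets):
    thunks = [make_env(env_id, seed, rank, log_dir, add_timestep, allow_early_resets)
              for rank in range(num_processes)]
    # SubprocVecEnv runs each env in its own process; DummyVecEnv keeps them in-process.
    return SubprocVecEnv(thunks) if num_processes > 1 else DummyVecEnv(thunks)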
from osim.env import L2RunEnv
import numpy as np
from scipy.optimize import minimize, Bounds
from sklearn.linear_model import LinearRegression, SGDRegressor

DEFAULT_SEED = 20180101
rng = np.random.RandomState(DEFAULT_SEED)

env = L2RunEnv(visualize=False)

# Obtain the dimensions of the observation space and action space
dim_obs = env.get_observation_space_size()
dim_act = env.get_action_space_size()

# Set the range of action values
action_low = env.action_space.low
action_high = env.action_space.high

# Bounds of the action space given by the env
bnds = Bounds(action_low, action_high)

# Set hyperparameters
discount = 1e-1
learning_rate = 1e-2
epsilon = 0.1
episode = 1000
batch_size = 10


class qfunction:
    # random initialization
    def __init__(self, dim_obs, dim_act, rng=None):
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
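# Hedged helper sketch (not in the original snippet): with `rng`, `action_low`,
# `action_high`, and `dim_act` defined above, a uniform random action inside the env
# bounds can be drawn like this, e.g. for the exploratory branch of an epsilon-greedy rule.
def sample_random_action():
    return rng.uniform(action_low, action_high, size=dim_act)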
def main_function(args, data):

    #### CONSTANTS INITIALIZATION ####

    ## Model ##
    SIZE_HIDDEN_LAYER_ACTOR = data['SIZE_HIDDEN_LAYER_ACTOR'][0]
    LR_ACTOR = data['LR_ACTOR'][0]
    SIZE_HIDDEN_LAYER_CRITIC = data['SIZE_HIDDEN_LAYER_CRITIC'][0]
    LR_CRITIC = data['LR_CRITIC'][0]
    DISC_FACT = data['DISC_FACT'][0]
    TARGET_MODEL_UPDATE = data['TARGET_MODEL_UPDATE'][0]
    BATCH_SIZE = data['BATCH_SIZE'][0]
    REPLAY_BUFFER_SIZE = data['REPLAY_BUFFER_SIZE'][0]

    ## Exploration ##
    THETA = data['THETA'][0]
    SIGMA = data['SIGMA'][0]
    SIGMA_MIN = data['SIGMA_MIN'][0]
    N_STEPS_ANNEALING = data['N_STEPS_ANNEALING'][0]

    ## Acceleration ##
    ACTION_REPETITION = data['ACTION_REPETITION'][0]
    INTEGRATOR_ACCURACY = data['INTEGRATOR_ACCURACY'][0]

    ## Simulation ##
    N_STEPS_TRAIN = int(args.step)
    N_EPISODE_TEST = 100
    if args.visualize:
        N_EPISODE_TEST = 3
    # 0: no report
    # 1: report every LOG_INTERVAL steps
    # 2: report after every episode
    VERBOSE = 1
    LOG_INTERVAL = 500

    ## Save weights ##
    if not os.path.exists('weights'):
        os.mkdir('weights')
        print("Directory ", 'weights', " Created ")
    FILES_WEIGHTS_NETWORKS = './weights/' + args.model + '.h5f'

    #### ENVIRONMENT LOADING ####
    if args.prosthetic:
        env = ProsContinueRewardWrapper(
            ProstheticsEnv(visualize=args.visualize,
                           integrator_accuracy=INTEGRATOR_ACCURACY))
    if not args.prosthetic:
        env = CustomDoneOsimWrapper(
            CustomRewardWrapper(
                RelativeMassCenterObservationWrapper(
                    NoObstacleObservationWrapper(
                        L2RunEnv(visualize=args.visualize,
                                 integrator_accuracy=0.005)))))
    env.reset()

    ## Examine the action space ##
    action_size = env.action_space.shape[0]
    # action_size = int(env.action_space.shape[0] / 2)  # for the symmetric agent
    print('Size of each action:', action_size)

    ## Examine the state space ##
    state_size = env.observation_space.shape[0]
    print('Size of state:', state_size)

    #### ACTOR / CRITIC ####

    ## Actor (mu) ##
    if args.prosthetic:
        input_shape = (1, env.observation_space.shape[0])
    if not args.prosthetic:
        input_shape = (1, env.observation_space.shape[0])
    observation_input = Input(shape=input_shape, name='observation_input')
    x = Flatten()(observation_input)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(action_size)(x)
    x = Activation('sigmoid')(x)
    actor = Model(inputs=observation_input, outputs=x)
    opti_actor = Adam(lr=LR_ACTOR)

    ## Critic (Q) ##
    action_input = Input(shape=(action_size,), name='action_input')
    x = Flatten()(observation_input)
    x = concatenate([action_input, x])
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    opti_critic = Adam(lr=LR_CRITIC)

    #### SET UP THE AGENT ####

    ## Initialize the replay buffer ##
    memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1)

    ## Random process (exploration) ##
    random_process = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,
                                              sigma_min=SIGMA_MIN, size=action_size,
                                              n_steps_annealing=N_STEPS_ANNEALING)
    # random_process_l = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA, sigma_min=SIGMA_MIN,
    #                                             size=action_size, n_steps_annealing=N_STEPS_ANNEALING)
    # random_process_r = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA, sigma_min=SIGMA_MIN,
    #                                             size=action_size, n_steps_annealing=N_STEPS_ANNEALING)

    ## DDPG agent parameters ##
    # agent = SymmetricDDPGAgent(nb_actions=action_size, actor=actor, critic=critic,
    #                            critic_action_input=action_input,
    #                            memory=memory, random_process_l=random_process_l,
    #                            random_process_r=random_process_r,
    #                            gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE,
    #                            batch_size=BATCH_SIZE)
    agent = DDPGAgent(nb_actions=action_size, actor=actor, critic=critic,
                      critic_action_input=action_input,
                      memory=memory, random_process=random_process,
                      gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE,
                      batch_size=BATCH_SIZE)
    agent.compile(optimizer=[opti_critic, opti_actor])

    #### TRAIN ####
    logdir = "keras_logs/" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    robustensorboard = RobustTensorBoard(log_dir=logdir, hyperparams=data)
    saveBest = SaveBestEpisode()

    if args.train:
        if args.resume:
            agent.load_weights(FILES_WEIGHTS_NETWORKS)
        else:
            check_overwrite(args.model)

        agent.fit(env, nb_steps=N_STEPS_TRAIN, visualize=args.visualize,
                  verbose=VERBOSE, log_interval=LOG_INTERVAL,
                  callbacks=[robustensorboard, saveBest],
                  action_repetition=ACTION_REPETITION)
        agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True)

    #### TEST ####
    if not args.train:
        agent.load_weights(FILES_WEIGHTS_NETWORKS)
        agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize)
def restore(self):
    # Re-create the environment wrapped inside
    self.env = L2RunEnv(visualize=self.visualize)
    self.env.osim_model.set_integrator_accuracy(self.integrator_accuracy)
    self.env.reset()
def worker(id, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps,
           update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''the function for sampling with multi-processing'''
    # the sac_trainer objects are not the same, but all networks and optimizers in them are shared;
    # the replay buffer is the same one.
    print(sac_trainer, replay_buffer)

    env = L2RunEnv(visualize=False)
    state_dim = 43
    action_dim = 18
    action_range = 1.

    frame_idx = 0
    rewards = []
    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            for _ in range(action_itr):
                try:
                    next_state, reward, done, _ = env.step(action)
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

            # if len(replay_buffer) > batch_size:
            if replay_buffer.get_length() > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size,
                                           reward_scale=10.,
                                           auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                # plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break

        print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
        if len(rewards) == 0:
            rewards.append(episode_reward)
        else:
            rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
        rewards_queue.put(episode_reward)

    sac_trainer.save_model(model_path)
    # (inside the main-process loop that collects episode rewards from the workers)
    if r is not None:
        # moving average of episode rewards
        rewards.append(0.9 * rewards[-1] + 0.1 * r)
    else:
        break
    if len(rewards) % 20 == 0 and len(rewards) > 0:
        plot(rewards)

[p.join() for p in processes]  # wait for all workers to finish at the same time
sac_trainer.save_model(model_path)

if args.test:
    # single process for testing
    env = L2RunEnv(visualize=True)  # L2M2019Env
    sac_trainer.load_model(model_path)
    for eps in range(10):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
            if done:
                break
from osim.env import L2RunEnv
import opensim

env = L2RunEnv(visualize=True)
observation = env.reset()

s = 0
for s in range(80):
    d = False
    if s == 30:
        state_old = env.osim_model.get_state()
        print("State stored")
        print(state_old)
    if s % 50 == 49:
        env.osim_model.set_state(state_old)
        print("Rollback")
        print(state_old)
    o, r, d, i = env.step(env.action_space.sample())
def run(seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gymify_osim_env(L2RunEnv(visualize=True))
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gymify_osim_env(L2RunEnv(visualize=True))
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def env_creator(env_config):
    # return an env instance
    return NoObstacleObservationWrapper(L2RunEnv(**env_config))
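# Hedged usage sketch: with Ray RLlib, the creator above is typically registered under a
# name and referenced from the trainer config. The environment name string and config
# values below are illustrative assumptions, not taken from the original file.
from ray.tune.registry import register_env

register_env("L2Run-NoObstacle-v0", env_creator)
# e.g. config = {"env": "L2Run-NoObstacle-v0", "env_config": {"visualize": False}}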
def __init__(self, wid):
    self.wid = wid
    self.env = L2RunEnv(visualize=False)
    self.ppo = GLOBAL_PPO
# loop over all individuals
# for i in pool:
#     print("\n", i.fitness)
#     print(i.allele)

print("Current gen {}".format(num))
chrom = pool[0]
print("fitness of best chromosome {}".format(chrom.fitness))
print(len(best_chrom))
print([i.fitness for i in best_chrom])

T = 2
from osim.env import L2RunEnv as RunEnv
e = RunEnv(visualize=True)
# e = RunEnv(visualize=False)
e.reset()

total_reward = 0
total_reward_aux = 0
for t in range(700):
    obs, reward, done, _ = e.step(controller.input(chrom.allele, T, t * 0.01))
    total_reward += reward
    if done:
        print("Done, {} steps".format(t))
        break
print(total_reward)

import matplotlib.pyplot as plt

# Best fitness
# print(best_fitness)
parser.add_argument('--test', dest='train', action='store_false', default=True)
parser.add_argument('--visualize', dest='visualize', action='store_true', default=False)
parser.add_argument('--model', dest='model', action='store', default="default")
args = parser.parse_args()

## Save models ##
if not os.path.exists('models'):
    os.mkdir('models')
    print("Directory ", 'models', " Created ")
MODELS_FOLDER_PATH = './models/' + args.model

#### ENVIRONMENT LOADING ####
env = L2RunEnv(visualize=args.visualize, integrator_accuracy=0.005)

## Examine the action space ##
action_size = env.action_space.shape[0]
print('Size of each action:', action_size)
action_low = env.action_space.low
print('Action low:', action_low)
action_high = env.action_space.high
print('Action high: ', action_high)

## Examine the state space ##
state_size = env.observation_space.shape[0]
print('Size of state:', state_size)

# Redefine action_space to -1/1 (the SAC implementation needs a symmetric action space)
# env.action_space = ([-1.0] * env.get_action_space_size(),
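# Hedged sketch (the commented-out line above is truncated in this excerpt; this is one
# plausible way to expose a symmetric [-1, 1] action space and map actions back to the
# env's own [action_low, action_high] range, not the author's original completion):
import numpy as np
from gym import spaces

sym_action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_size,), dtype=np.float32)

def rescale_action(a):
    # map a in [-1, 1] to [action_low, action_high]
    return action_low + (a + 1.0) * 0.5 * (action_high - action_low)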
def make_env(max_steps, seed):
    from osim.env import L2RunEnv
    # load the env
    env = L2RunEnv(visualize=True)
    env.seed(seed)
    return Monitor(TimeLimit(env, max_steps))
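# Hedged usage sketch: a short random-policy rollout with the factory above; the step
# budget and seed are illustrative.
if __name__ == "__main__":
    env = make_env(max_steps=300, seed=0)
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        total_reward += reward
    print("episode return:", total_reward)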