Example 1
def worker(id, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size,
           explore_steps, update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''
    Sampling loop run by each worker process.
    '''

    with torch.cuda.device(id % torch.cuda.device_count()):
        sac_trainer.to_cuda()
        print(
            sac_trainer, replay_buffer
        )  # sac_trainer instances are not the same, but all networks and optimizers inside them are shared; the replay buffer is the same object.
        env = L2RunEnv(visualize=False)

        state_dim = 43
        action_dim = 18
        action_range = 1.
        frame_idx = 0
        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            state = env.reset()

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(
                        state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()

                for _ in range(action_itr):
                    try:
                        next_state, reward, done, _ = env.step(action)
                    except KeyboardInterrupt:
                        print('Finished')
                        sac_trainer.save_model(model_path)
                        return  # stop this worker instead of pushing a stale transition

                    replay_buffer.push(state, action, reward, next_state, done)

                    state = next_state
                    episode_reward += reward
                    frame_idx += 1

                if replay_buffer.get_length() > batch_size:
                    for i in range(update_itr):
                        _ = sac_trainer.update(batch_size,
                                               reward_scale=10.,
                                               auto_entropy=AUTO_ENTROPY,
                                               target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    sac_trainer.save_model(model_path)

                if done:
                    break
            print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ',
                  episode_reward)
            rewards_queue.put(episode_reward)

        sac_trainer.save_model(model_path)
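
For context, a minimal sketch of how a worker like this might be launched, assuming torch.multiprocessing, a trainer that exposes share_memory(), and a replay_buffer that is already a process-shared object (e.g. a multiprocessing manager proxy); none of these assumptions come from the snippet itself, and the hyperparameter values are placeholders.

import torch.multiprocessing as mp

def launch_workers(num_workers, sac_trainer, replay_buffer, model_path):
    # Hedged sketch: several sampling workers sharing one replay buffer and one rewards queue.
    sac_trainer.share_memory()   # assumption: the trainer exposes share_memory()
    rewards_queue = mp.Queue()   # workers push episode rewards here
    processes = []
    for i in range(num_workers):
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, rewards_queue, replay_buffer,
                             1000, 1000, 256, 500, 1, 1, True, False, 512, model_path))
        p.daemon = True
        p.start()
        processes.append(p)
    return processes, rewards_queue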
Example 2
    def __init__(self, visualize, integrator_accuracy, full=False, action_repeat=5, fail_reward=-0.2,
                 exclude_centering_frame=False):
        """
        Initialize the environment.
        Parameters:
        - full: use the full observation vector as the observation
        - action_repeat: how many frames each action is repeated for
        - exclude_centering_frame: whether to drop the pelvis x from the obs vector (obs are centered w.r.t. pelvis x)
        """
        env = L2RunEnv(visualize=visualize)
        env.osim_model.set_integrator_accuracy(integrator_accuracy)
        gym.Wrapper.__init__(self, env)
        env.reset()
        self.integrator_accuracy = integrator_accuracy
        self.visualize = visualize
        self.full = full
        self.env = env
        self.action_repeat = action_repeat
        self.fail_reward = fail_reward
        self.exclude_centering_frame = exclude_centering_frame
        self.env_step = 0
        if self.full:
            self.get_observation = self.get_observation_full
        else:
            self.get_observation = self.get_observation_basic

        obs_size = self.get_observation_space_size()
        self.observation_space = ([0] * obs_size, [0] * obs_size)
        self.observation_space = convert_to_gym(self.observation_space)
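
Only __init__ is shown above; a hedged sketch of a step() consistent with its action_repeat and fail_reward fields might look like the following (the early-termination penalty is an assumption, not the wrapper's actual code).

    def step(self, action):
        # Hedged sketch: repeat the action, accumulate reward, penalize an early fall.
        total_reward = 0.0
        done = False
        info = {}
        for _ in range(self.action_repeat):
            _, reward, done, info = self.env.step(action)
            total_reward += reward
            self.env_step += 1
            if done:
                # assumption: an episode that ends early is penalized with fail_reward
                total_reward += self.fail_reward
                break
        return self.get_observation(), total_reward, done, info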
Example 3
    def test_activations_changes(self):
        env = L2RunEnv(visualize=False)

        # First rollout: start from activations of 0.9
        newAct = [0.9] * 18
        observation = env.reset()
        env.osim_model.set_activations(newAct)
        for i in range(5):
            withoutAct = env.osim_model.get_activations()
            observation, reward, done, info = env.step([0.5] * 18)

        # Second rollout: start from activations of 0.1
        newAct = [0.1] * 18
        observation = env.reset()
        env.osim_model.set_activations(newAct)
        for i in range(5):
            withAct = env.osim_model.get_activations()
            observation, reward, done, info = env.step([0.5] * 18)

        dist = np.linalg.norm(np.array(withAct) - np.array(withoutAct))

        self.assertFalse(
            dist < 1e-2,
            "Activations after 5 steps haven't changed (despite different initial conditions)"
        )
Example 4
    def test_actions(self):
        env = L2RunEnv(visualize=False)
        env.reset()
        v = env.action_space.sample()
        v[0] = 1.5
        v[1] = -0.5
        observation, reward, done, info = env.step(v)
Example 5
    def test_reset(self):
        env = L2RunEnv(visualize=False)
        for i in range(10):
            observation = env.reset()

        action = env.action_space.sample()
        action[5] = np.NaN
        self.assertRaises(ValueError, env.step, action)
Example 6
    def test_clipping(self):
        env = L2RunEnv(visualize=False)
        observation = env.reset()

        env.step(np.array([5.0] * 18))
        self.assertLessEqual(np.sum(env.osim_model.last_action), 18.1)
        env.step(np.array([-1.0] * 18))
        self.assertGreaterEqual(np.sum(env.osim_model.last_action), -0.1)
Example 7
def env(chrom):
    """Roll out the controller encoded by `chrom` and return its total reward (floored at 0)."""
    from osim.env import L2RunEnv as RunEnv
    e = RunEnv(visualize=False)
    e.reset()

    T = 2
    total_reward = 0
    for t in range(500):
        obs, reward, done, _ = e.step(
            controller.input(chrom.allele, T, t * 0.01))
        total_reward += reward
        if done:
            break
    # print("HEADLESS: The reward is {}".format(total_reward))

    # enables to calculate accumulated fitness
    if total_reward < 0: total_reward = 0
    del e
    return total_reward
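
Since env() builds its own environment, it can be evaluated in parallel. A hedged sketch using multiprocessing.Pool, assuming the chromosome objects are picklable and expose a writable fitness attribute (neither is confirmed by the snippet):

from multiprocessing import Pool

def evaluate_population(chromosomes, n_procs=4):
    # Hedged sketch: one fitness rollout per chromosome, spread over n_procs workers.
    with Pool(n_procs) as p:
        fitnesses = p.map(env, chromosomes)
    for chrom, fit in zip(chromosomes, fitnesses):
        chrom.fitness = fit
    return fitnesses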
Example 8
    def __init__(self,
                 reward_scale=1.,
                 frame_skip=1,
                 visualize=False,
                 reinit_random_action_every=1):
        self.reward_scale = reward_scale
        self.frame_skip = frame_skip
        self.vis = visualize
        self.reinit_random_action_every = reinit_random_action_every

        self.env = L2RunEnv(visualize=visualize)
        self.observation_shapes = [(43, )]
        self.action_size = 18
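
Only __init__ is shown; a hedged sketch of how reinit_random_action_every might be used during warm-up exploration (assuming numpy is imported as np and the method is first called with step_index 0; this is not the wrapper's actual code):

    def get_random_action(self, step_index):
        # Hedged sketch: draw a fresh uniform action only every k-th call,
        # otherwise repeat the previous one.
        if step_index % self.reinit_random_action_every == 0:
            self._last_random_action = np.random.uniform(0.0, 1.0, size=self.action_size)
        return self._last_random_action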
Example 9
    def test_activations(self):
        env = L2RunEnv(visualize=False)
        observation = env.reset()

        newact = np.array([0.0] * 18)
        env.osim_model.set_activations(newact)

        current = np.array(env.osim_model.get_activations())
        dist = np.linalg.norm(newact - current)
        self.assertTrue(dist < 0.05)

        newact = np.array([1.0] * 18)
        env.osim_model.set_activations(newact)

        current = np.array(env.osim_model.get_activations())
        dist = np.linalg.norm(newact - current)
        self.assertTrue(dist < 0.05)
Example 10
    def _thunk():
        info_keywords = ()
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        elif env_id.startswith("osim"):
            info_keywords = ('rb', )
            # https://github.com/stanfordnmbl/osim-rl
            _, task = env_id.split('.')
            if task == "Prosthetics":
                env = MyProstheticsEnv(integrator_accuracy=1e-4, **kwargs)
            elif task == "Arm2D":
                env = Arm2DEnv(integrator_accuracy=1e-4, **kwargs)
            else:  # task == "L2Run"
                assert task == "L2Run"
                env = L2RunEnv(integrator_accuracy=1e-4, **kwargs)
        else:
            env = gym.make(env_id)
        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)
        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if add_timestep and len(
                obs_shape) == 1 and str(env).find('TimeLimit') > -1:
            env = AddTimestep(env)

        if log_dir is not None:
            env = Monitor(env,
                          os.path.join(log_dir, str(rank)),
                          info_keywords=info_keywords,
                          allow_early_resets=allow_early_resets)

        if is_atari:
            env = wrap_deepmind(env)

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env)

        return env
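
Thunks like _thunk are typically collected into a vectorized environment. A hedged sketch, assuming the baselines-style SubprocVecEnv and an outer make_env(env_id, seed, rank, log_dir, add_timestep, allow_early_resets) factory that returns _thunk (that signature is an assumption):

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_vec_envs(env_id, seed, num_processes, log_dir, add_timestep, allow_early_resets):
    # Hedged sketch: one thunk per worker process, each seeded with seed + rank inside _thunk.
    thunks = [make_env(env_id, seed, rank, log_dir, add_timestep, allow_early_resets)
              for rank in range(num_processes)]
    return SubprocVecEnv(thunks)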
Example 11
from osim.env import L2RunEnv
import numpy as np
from scipy.optimize import minimize, Bounds
from sklearn.linear_model import LinearRegression, SGDRegressor

DEFAULT_SEED = 20180101
rng = np.random.RandomState(DEFAULT_SEED)

env = L2RunEnv(visualize=False)
# Obtain the dimensions of the observation and action spaces
dim_obs = env.get_observation_space_size()
dim_act = env.get_action_space_size()

# Set the range of action values
action_low = env.action_space.low
action_high = env.action_space.high  # bounds of action space by env
bnds = Bounds(action_low, action_high)

# Set hyperparameters
discount = 1e-1
learning_rate = 1e-2
epsilon = 0.1
episode = 1000
batch_size = 10


class qfunction:
    # random initialization
    def __init__(self, dim_obs, dim_act, rng=None):
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
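
The class is cut off at this point. A hedged sketch of how the imported minimize and Bounds could serve greedy action selection over the continuous action space (qfunc.predict is a hypothetical method, not taken from the snippet):

def greedy_action(qfunc, obs, dim_act):
    # Hedged sketch: maximize Q(s, a) by minimizing -Q(s, a) within the action bounds.
    a0 = rng.uniform(action_low, action_high, size=dim_act)  # random starting point
    res = minimize(lambda a: -qfunc.predict(obs, a), a0, bounds=bnds)
    return res.x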
Example 12
def main_function(args, data):
    #### CONSTANTS INITIALIZATION #####
    ## Model ##
    SIZE_HIDDEN_LAYER_ACTOR = data['SIZE_HIDDEN_LAYER_ACTOR'][0]
    LR_ACTOR = data['LR_ACTOR'][0]
    SIZE_HIDDEN_LAYER_CRITIC = data['SIZE_HIDDEN_LAYER_CRITIC'][0]
    LR_CRITIC = data['LR_CRITIC'][0]
    DISC_FACT = data['DISC_FACT'][0]
    TARGET_MODEL_UPDATE = data['TARGET_MODEL_UPDATE'][0]
    BATCH_SIZE = data['BATCH_SIZE'][0]
    REPLAY_BUFFER_SIZE = data['REPLAY_BUFFER_SIZE'][0]
    ## Exploration ##
    THETA = data['THETA'][0]
    SIGMA = data['SIGMA'][0]
    SIGMA_MIN = data['SIGMA_MIN'][0]
    N_STEPS_ANNEALING = data['N_STEPS_ANNEALING'][0]

    ## Acceleration ##
    ACTION_REPETITION = data['ACTION_REPETITION'][0]
    INTEGRATOR_ACCURACY = data['INTEGRATOR_ACCURACY'][0]

    # # Simulation ##
    N_STEPS_TRAIN = int(args.step)
    N_EPISODE_TEST = 100
    if args.visualize:
        N_EPISODE_TEST = 3
    VERBOSE = 1
    # 0: no progress report
    # 1: progress report every LOG_INTERVAL steps
    # 2: progress report at every episode
    LOG_INTERVAL = 500

    # Save weights ##
    if not os.path.exists('weights'):
        os.mkdir('weights')
        print("Directory ", 'weights', " Created ")
    FILES_WEIGHTS_NETWORKS = './weights/' + args.model + '.h5f'

    # #### ENVIRONMENT LOADING #####
    if args.prosthetic:
        env = ProsContinueRewardWrapper(
            ProstheticsEnv(visualize=args.visualize,
                           integrator_accuracy=INTEGRATOR_ACCURACY))
    else:
        env = CustomDoneOsimWrapper(
            CustomRewardWrapper(
                RelativeMassCenterObservationWrapper(
                    NoObstacleObservationWrapper(
                        L2RunEnv(visualize=args.visualize,
                                 integrator_accuracy=0.005)))))

    env.reset()
    # Examine the action space ##
    action_size = env.action_space.shape[0]
    # action_size = int(env.action_space.shape[0] / 2)  # for symmetry
    print('Size of each action:', action_size)

    # Examine the state space ##
    state_size = env.observation_space.shape[0]
    print('Size of state:', state_size)

    # #### ACTOR / CRITIC #####

    # Actor (mu) ##
    # Same input shape in both cases: (1, observation_dim)
    input_shape = (1, env.observation_space.shape[0])

    observation_input = Input(shape=input_shape, name='observation_input')

    x = Flatten()(observation_input)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(action_size)(x)
    x = Activation('sigmoid')(x)

    actor = Model(inputs=observation_input, outputs=x)
    opti_actor = Adam(lr=LR_ACTOR)

    # Critic (Q) ##
    action_input = Input(shape=(action_size, ), name='action_input')

    x = Flatten()(observation_input)
    x = concatenate([action_input, x])
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)

    critic = Model(inputs=[action_input, observation_input], outputs=x)

    opti_critic = Adam(lr=LR_CRITIC)

    # #### SET UP THE AGENT #####
    # Initialize Replay Buffer ##
    memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1)

    # Random process (exploration) ##
    random_process = OrnsteinUhlenbeckProcess(
        theta=THETA,
        mu=0,
        sigma=SIGMA,
        sigma_min=SIGMA_MIN,
        size=action_size,
        n_steps_annealing=N_STEPS_ANNEALING)

    # random_process_l = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN,
    #                                           size=action_size, n_steps_annealing=N_STEPS_ANNEALING)
    # random_process_r = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN,
    #                                           size=action_size, n_steps_annealing=N_STEPS_ANNEALING)

    # DDPG agent parameters ##
    # agent = SymmetricDDPGAgent(nb_actions=action_size, actor=actor, critic=critic,
    #                            critic_action_input=action_input,
    #                            memory=memory, random_process_l=random_process_l, random_process_r=random_process_r,
    #                            gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE,
    #                            batch_size=BATCH_SIZE)

    agent = DDPGAgent(nb_actions=action_size,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      random_process=random_process,
                      gamma=DISC_FACT,
                      target_model_update=TARGET_MODEL_UPDATE,
                      batch_size=BATCH_SIZE)

    agent.compile(optimizer=[opti_critic, opti_actor])

    # #### TRAIN #####
    logdir = "keras_logs/" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    robustensorboard = RobustTensorBoard(log_dir=logdir, hyperparams=data)
    saveBest = SaveBestEpisode()
    if args.train:
        if args.resume:
            agent.load_weights(FILES_WEIGHTS_NETWORKS)
        else:
            check_overwrite(args.model)

        agent.fit(env,
                  nb_steps=N_STEPS_TRAIN,
                  visualize=args.visualize,
                  verbose=VERBOSE,
                  log_interval=LOG_INTERVAL,
                  callbacks=[robustensorboard, saveBest],
                  action_repetition=ACTION_REPETITION)

        agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True)

    #### TEST #####
    if not args.train:
        agent.load_weights(FILES_WEIGHTS_NETWORKS)
        agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize)
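
For reference, a hedged sketch of the argparse setup this function appears to expect; the flag names are inferred from the args.* attributes used above, the --test/--visualize/--model lines mirror the parser lines shown later in Example 21, and the remaining defaults are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--test', dest='train', action='store_false', default=True)
parser.add_argument('--resume', action='store_true', default=False)
parser.add_argument('--visualize', dest='visualize', action='store_true', default=False)
parser.add_argument('--prosthetic', action='store_true', default=False)
parser.add_argument('--model', dest='model', action='store', default='default')
parser.add_argument('--step', action='store', default=1000000)
args = parser.parse_args()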
Example 13
    def restore(self):
        # Restore the environment wrapped inside
        self.env = L2RunEnv(visualize=self.visualize)
        self.env.osim_model.set_integrator_accuracy(self.integrator_accuracy)
        self.env.reset()
Example 14
def worker(id, sac_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size,
           explore_steps, update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''
    Sampling loop run by each worker process.
    '''

    print(
        sac_trainer, replay_buffer
    )  # sac_trainer instances are not the same, but all networks and optimizers inside them are shared; the replay buffer is the same object.
    env = L2RunEnv(visualize=False)

    state_dim = 43
    action_dim = 18
    action_range = 1.

    # training loop
    for eps in range(max_episodes):
        frame_idx = 0
        rewards = []
        episode_reward = 0
        state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(
                    state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            for _ in range(action_itr):
                try:
                    next_state, reward, done, _ = env.step(action)
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)
                    return  # stop this worker instead of pushing a stale transition

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

            # if len(replay_buffer) > batch_size:
            if replay_buffer.get_length() > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size,
                                           reward_scale=10.,
                                           auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                # plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break
        print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ',
              episode_reward)
        if len(rewards) == 0:
            rewards.append(episode_reward)
        else:
            rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)  # smoothed episode reward
        rewards_queue.put(episode_reward)

    sac_trainer.save_model(model_path)
Example 15
            if r is not None:
                rewards.append(0.9 * rewards[-1] +
                               0.1 * r)  # moving average of episode rewards
            else:
                break

            if len(rewards) % 20 == 0 and len(rewards) > 0:
                plot(rewards)

        for p in processes:  # wait for all workers to finish
            p.join()

        sac_trainer.save_model(model_path)

    if args.test:
        # single process for testing
        env = L2RunEnv(visualize=True)  # L2M2019Env
        sac_trainer.load_model(model_path)
        for eps in range(10):
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = sac_trainer.policy_net.get_action(
                    state, deterministic=DETERMINISTIC)
                next_state, reward, done, _ = env.step(action)

                episode_reward += reward
                state = next_state

                if done:
                    break
Example 16
from osim.env import L2RunEnv
import opensim

env = L2RunEnv(visualize=True)
observation = env.reset()

for s in range(80):

    if s == 30:
        state_old = env.osim_model.get_state()
        print("State stored")
        print(state_old)
    if s % 50 == 49:
        env.osim_model.set_state(state_old)
        print("Rollback")
        print(state_old)

    o, r, d, i = env.step(env.action_space.sample())
Example 17
def run(seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gymify_osim_env(L2RunEnv(visualize=True))
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gymify_osim_env(L2RunEnv(visualize=True))
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example 18
def env_creator(env_config):
    return NoObstacleObservationWrapper(L2RunEnv(**env_config))  # return an env instance
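
A hedged usage sketch: registering this creator with Ray RLlib (the environment name string is an arbitrary choice, not taken from the snippet).

from ray.tune.registry import register_env

# The name is arbitrary; it just has to match the "env" key in the trainer config.
register_env("L2Run-NoObstacle-v0", env_creator)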
Example 19
    def __init__(self, wid):
        self.wid = wid
        self.env = L2RunEnv(visualize=False)

        self.ppo = GLOBAL_PPO
Example 20
# loop for all individuals
# for i in pool:
# 	print("\n",i.fitness)
# 	print(i.allele)

print("Current gen {}".format(num))
chrom = pool[0]
print("fitness of best chromosome {}".format(chrom.fitness))

print(len(best_chrom))
print([i.fitness for i in best_chrom])

T = 2
from osim.env import L2RunEnv as RunEnv
e = RunEnv(visualize=True)
# e = RunEnv(visualize=False)
e.reset()
total_reward = 0
total_reward_aux = 0
for t in range(700):
    obs, reward, done, _ = e.step(controller.input(chrom.allele, T, t * 0.01))
    total_reward += reward
    if done:
        print("Done, {} steps".format(t))
        break
print(total_reward)

import matplotlib.pyplot as plt
# Best fitness
# print(best_fitness)
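
A hedged sketch of the plot the trailing matplotlib import appears to lead up to, assuming best_chrom holds one best chromosome per generation (as the list comprehension printed above suggests):

plt.plot([i.fitness for i in best_chrom])
plt.xlabel("Generation")
plt.ylabel("Best fitness")
plt.show()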
Example 21
parser.add_argument('--test', dest='train', action='store_false', default=True)
parser.add_argument('--visualize',
                    dest='visualize',
                    action='store_true',
                    default=False)
parser.add_argument('--model', dest='model', action='store', default="default")
args = parser.parse_args()

# Save models ##
if not os.path.exists('models'):
    os.mkdir('models')
    print("Directory ", 'models', " Created ")
MODELS_FOLDER_PATH = './models/' + args.model

# #### ENVIRONMENT LOADING #####
env = L2RunEnv(visualize=args.visualize, integrator_accuracy=0.005)

# Examine the action space ##
action_size = env.action_space.shape[0]
print('Size of each action:', action_size)
action_low = env.action_space.low
print('Action low:', action_low)
action_high = env.action_space.high
print('Action high: ', action_high)

# Examine the state space ##
state_size = env.observation_space.shape[0]
print('Size of state:', state_size)

# Redefine action_space to -1/1 (sac implementation needs a symmetric action space) #
env.action_space = ([-1.0] * env.get_action_space_size(),
                    [1.0] * env.get_action_space_size())

def make_env(max_steps, seed):
    from osim.env import L2RunEnv  # load the env
    env = L2RunEnv(visualize=True)
    env.seed(seed)
    return Monitor(TimeLimit(env, max_steps))