def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
    np.random.seed(seed)
    env = L2M2019Env(difficulty=3, visualize=vis)
    env.change_model(model='3D', difficulty=3)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = OfficialObs(env)
    all_reward = []
    for e in range(episode_num):
        episode_reward = 0.0
        observation = env.reset(project=True, obs_as_dict=True)
        step = 0
        target_change_times = 0
        while True:
            step += 1
            action = submit_model.pred_batch(observation, target_change_times)
            observation, reward, done, info = env.step(
                action, project=True, obs_as_dict=True)
            if info['target_changed']:
                target_change_times += 1
            episode_reward += reward
            if done:
                break
        all_reward.append(episode_reward)
        logger.info("[episode/{}] episode_reward:{} mean_reward:{}".format(
            e, episode_reward, np.mean(all_reward)))
def __init__(self,
             difficulty,
             vel_penalty_coeff,
             muscle_penalty_coeff,
             penalty_coeff,
             only_first_target=False):
    random_seed = np.random.randint(int(1e9))
    env = L2M2019Env(difficulty=difficulty, visualize=False, seed=random_seed)
    max_timelimit = env.time_limit

    env = FinalReward(env,
                      max_timelimit=max_timelimit,
                      vel_penalty_coeff=vel_penalty_coeff,
                      muscle_penalty_coeff=muscle_penalty_coeff,
                      penalty_coeff=penalty_coeff)

    if only_first_target:
        assert difficulty == 3, "argument `only_first_target` is available only in `difficulty=3`."
        env = FirstTarget(env)

    env = FrameSkip(env)
    env = ActionScale(env)
    self.env = OfficialObs(env, max_timelimit=max_timelimit)
def __init__(self, config):
    super(LearnToMove, self).__init__(config)
    self.env = L2M2019Env(visualize=bool(config['visualize']),
                          integrator_accuracy=0.001)
    self.project = True  # False: dict of size 14, True: dict of size 4
    self.env.reset(project=self.project)
    self.observation_transformer = ObservationTransformer()
def __init__(self,
             history_len=1,
             frame_skip=1,
             reward_scale=1,
             reload_period=None,
             action_mean=None,
             action_std=None,
             visualize=False,
             mode="train",
             **params):
    super().__init__(visualize=visualize, mode=mode)

    env = L2M2019Env(**params, visualize=visualize)
    env = EnvNormalizer(env)
    self.env = env

    self._history_len = history_len
    self._frame_skip = frame_skip
    self._visualize = visualize
    self._reward_scale = reward_scale
    self._reload_period = reload_period or BIG_NUM
    self.episode = 0

    self.action_mean = np.array(action_mean) \
        if action_mean is not None else None
    self.action_std = np.array(action_std) \
        if action_std is not None else None

    self._prepare_spaces()
class MyEnv(L2M2019Env):
    env = L2M2019Env(visualize=False)

    def reset(self, **kwargs):
        obs_dict = self.env.reset()
        return get_observation(obs_dict)

    def step(self, action, **kwargs):
        obs_dict, reward, done, info = self.env.step(action)
        return get_observation(obs_dict), reward, done, info
def __init__(self, it_max, ep_max):
    super().__init__()
    self.it_max = it_max
    self.ep_max = ep_max
    self.env = L2M2019Env(visualize=False, difficulty=3)
    # self.obs_high = np.array(self.env.observation_space.high)
    # self.obs_low = np.array(self.env.observation_space.low)
    self.stop_measure = 0
    self.patience = 5
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--visualize', action='store_true')
    parser.add_argument('--load_data', action='store_true')
    parser.add_argument('--load_policy', action='store_true')
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--num_expert_rollouts', type=int, default=1,
                        help='Number of expert rollouts')
    parser.add_argument('--num_dagger_updates', type=int, default=20,
                        help='Number of DAgger iterations')
    parser.add_argument('--rollouts_per_update', type=int, default=5,
                        help='Number of rollouts collected per DAgger iteration')
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--batch_size', type=int, default=32)

    mode = '2D'
    difficulty = 2
    visualize = False
    seed = None
    sim_dt = 0.01
    sim_t = 10
    timestep_limit = int(round(sim_t / sim_dt))

    if mode == '2D':
        params = np.loadtxt('./osim/control/params_2D.txt')
    elif mode == '3D':
        params = np.loadtxt('./osim/control/params_3D.txt')

    args = parser.parse_args()

    locoCtrl = OsimReflexCtrl(mode=mode, dt=sim_dt)
    locoCtrl.set_control_params(params)
    env = L2M2019Env(visualize=args.visualize, seed=seed, difficulty=difficulty)
    env.change_model(model=mode, difficulty=difficulty, seed=seed)
    env.spec.timestep_limit = timestep_limit
    max_steps = args.max_timesteps or env.spec.timestep_limit

    with tf.Session():
        initialize()
        dagger_policy_fn = DAgger().run_dagger(
            env, args.load_data, args.load_policy, max_steps,
            args.num_expert_rollouts, args.num_dagger_updates,
            args.rollouts_per_update, args.epochs, args.batch_size, locoCtrl)
def __init__(self, args):
    self.id = rpc.get_worker_info().id
    if args.env_name == 'L2M2019Env':
        self.env = L2M2019Env(visualize=False,
                              difficulty=args.difficulty,
                              seed=args.seed + self.id)
        self.test_env = L2M2019Env(visualize=False,
                                   difficulty=args.difficulty,
                                   seed=args.seed + self.id + 999)
        self.obs_mean = np.array(args.obs_mean)
        self.obs_std = np.array(args.obs_std)
    else:
        self.env = gym.make(args.env_name)
        self.test_env = gym.make(args.env_name)
        self.env.seed(args.seed + self.id)
        self.test_env.seed(args.seed + self.id + 999)
    self.act_limit = self.env.action_space.high[0]
    self.done = True
    self.len = 0
    self.args = args
def f_ind(n_gen, i_worker, params):
    flag_model = '2D'
    flag_ctrl_mode = '2D'  # use 2D
    seed = None
    difficulty = 0
    sim_dt = 0.01
    sim_t = 20
    timestep_limit = int(round(sim_t / sim_dt))

    # Retry environment creation until it succeeds.
    init_error = True
    error_count = 0
    while init_error:
        try:
            locoCtrl = OsimReflexCtrl(mode=flag_ctrl_mode, dt=sim_dt)
            env = L2M2019Env(seed=seed, difficulty=difficulty, visualize=False)
            env.change_model(model=flag_model, difficulty=difficulty, seed=seed)
            obs_dict = env.reset(project=True, seed=seed,
                                 init_pose=init_pose, obs_as_dict=True)
            init_error = False
        except Exception as e_msg:
            error_count += 1
            print('\ninitialization error (x{})!!!'.format(error_count))
            #print(e_msg)
            #import pdb; pdb.set_trace()
    env.spec.timestep_limit = timestep_limit + 100

    total_reward = 0
    error_sim = 0
    t = 0
    while True:
        t += sim_dt
        locoCtrl.set_control_params(params)
        action = locoCtrl.update(obs_dict)
        obs_dict, reward, done, info = env.step(action, project=True,
                                                obs_as_dict=True)
        total_reward += reward
        if done:
            break

    print('\n    gen#={} sim#={}: score={} time={}sec #step={}'.format(
        n_gen, i_worker, total_reward, t, env.footstep['n']))
    return total_reward  # minimization
def __init__(self, world_size, args):
    if args.env_name == 'L2M2019Env':
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        obs_dim = 99
    else:
        env = gym.make(args.env_name)
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    self.device = torch.device(args.device)
    self.args = args
    self.world_size = world_size
    self.actor_critic = MLPActorCritic(
        obs_dim, act_dim, hidden_sizes=args.hidden_sizes).to(self.device)
    self.replay_buffer = [
        ReplayBuffer(obs_dim, act_dim, args.buffer_size)
        for _ in range(1, world_size)
    ]
    self.gac = GAC(self.actor_critic,
                   self.replay_buffer,
                   device=self.device,
                   gamma=args.gamma,
                   alpha_start=args.alpha_start,
                   alpha_min=args.alpha_min,
                   alpha_max=args.alpha_max)
    self.test_len = 0.0
    self.test_ret = 0.0

    self.ob_rrefs = []
    for ob_rank in range(1, world_size):
        ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
        self.ob_rrefs.append(remote(ob_info, Observer, args=(args, )))
    self.agent_rref = RRef(self)
import time
import csv
import matplotlib.pyplot as plt
from datetime import datetime
from osim.env import L2M2019Env
import sys
import h5py
import numpy as np

# Show the entire Q-table when printing
np.set_printoptions(threshold=sys.maxsize)

# Adjust max_episode_steps and episodes for different learning runs

# Initialize environment
env_name = 'L2M2019Env'
env = L2M2019Env(visualize=False)
env.reset()
env._max_episode_steps = 10  # set max steps per episode
#env.seed(0)         # set environment seed for the same initial positions
#np.random.seed(0)   # set numpy RNG to reproduce the same "random" action sequence

# Get action and state spaces
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

# Set hyperparameters
initial_lr = 1.0   # learning rate
min_lr = 0.005     # minimum learning rate
gamma = 0.8        # discount factor: balances immediate and future reward (typically 0.8 to 0.99)
epsilon = 0.05     # higher -> more exploitation, less exploration
n_states = 339     # number of states
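# --- Hedged illustration (not part of the original script) ------------------
# The hyperparameters above are the standard tabular Q-learning quantities.
# The sketch below shows how they would typically enter the update rule; the
# table shape, the dummy transition, and the decay schedule are assumptions.
n_actions = 10                     # assumed size of a discretized action set
q_table = np.zeros((n_states, n_actions))

episode = 0
lr = max(min_lr, initial_lr * (0.85 ** (episode // 100)))  # decay toward min_lr

s, a, r, s_next = 0, 0, 10.0, 1    # dummy transition for illustration only
# TD update: move Q(s, a) toward the bootstrapped target r + gamma * max_a' Q(s', a')
q_table[s, a] += lr * (r + gamma * np.max(q_table[s_next]) - q_table[s, a])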
    while n < num_episodes:
        # if render:
        #     env.render()
        #     time.sleep(1e-3)
        a = get_action(o)
        o, r, d, _ = env.step(a, obs_as_dict=False)
        o = np.array(o)
        ep_ret += r
        ep_len += 1
        if d or (ep_len == max_ep_len):
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(obs_as_dict=False), 0, False, 0, 0
            o = np.array(o)
            n += 1


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fpath', type=str)
    parser.add_argument('--len', '-l', type=int, default=0)
    parser.add_argument('--episodes', '-n', type=int, default=100)
    parser.add_argument('--render', '-r', action='store_true')
    parser.add_argument('--deterministic', '-d', action='store_true')
    parser.add_argument('--device', type=str, default='cuda:0')
    parser.add_argument('--difficulty', type=int, default=1)
    args = parser.parse_args()

    env = L2M2019Env(visualize=args.render, difficulty=args.difficulty)
    get_action = load_pytorch_policy(args.fpath, args.device, args.deterministic)
    run_policy(env, get_action, args.len, args.episodes, args.render)
from keras.layers import Dense, Activation, Flatten, Input, concatenate
from keras.models import Sequential
from keras.optimizers import Adam
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from osim.env import L2M2019Env
import argparse

# Command line parameters
parser = argparse.ArgumentParser(description='Train or test neural net motor controller')
parser.add_argument('--model', dest='model', action='store', default="example.h5f")
parser.add_argument('--episodes', type=int, default=5)
args = parser.parse_args()

env = L2M2019Env(visualize=True)
nb_actions = env.action_space.shape[0]

# Total number of steps in training

# Create networks for DDPG
# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
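# --- Hedged sketch (not from the original script) ---------------------------
# The imports above pull in Input, concatenate, DDPGAgent, SequentialMemory and
# OrnsteinUhlenbeckProcess, which keras-rl's DDPG examples use for the critic
# and the agent; the actor above also ends without its output layer. A minimal
# continuation in that style could look like the following. The sigmoid output
# (muscle excitations lie in [0, 1]), the layer sizes, the `Model` import and
# all hyperparameter values are assumptions, not the original code.
from keras.models import Model

actor.add(Dense(nb_actions))
actor.add(Activation('sigmoid'))   # assumed: excitations bounded in [0, 1]

action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape,
                          name='observation_input')
flattened_observation = Flatten()(observation_input)
x = concatenate([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2,
                                          size=nb_actions)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  random_process=random_process)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])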
    pd.DataFrame(Reward).to_csv("./results/reward.csv")
    actor.save("./results/")
    critic.save("./results/")


if __name__ == '__main__':
    model = '3D'
    difficulty = 1
    seed = None
    project = True
    obs_as_dict = False

    env = L2M2019Env(seed=seed, difficulty=difficulty, visualize=False)
    env.change_model(model=model, difficulty=difficulty, seed=seed)
    obs_dict = env.reset(project=project, seed=seed, obs_as_dict=obs_as_dict)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    lr = 0.0002
    tau = 0.001
    #print(env.action_space.high, env.action_space.low)
    #state_dim = vectorized_state.shape[0]
    #action_dim = int(vectorized_state.shape[0]/4)
    '''
    print("state", vectorized_state)
        res, [diff_vel_x / 5.0, diff_vel_z / 5.0, diff_vel / 5.0])

    # current relative target theta
    target_v_x, target_v_z = obs_dict['v_tgt_field'][0][5][5], obs_dict[
        'v_tgt_field'][1][5][5]
    target_theta = math.atan2(target_v_z, target_v_x)
    diff_theta = target_theta
    res = np.append(res, [diff_theta / np.pi])

    return res


if __name__ == '__main__':
    from osim.env import L2M2019Env

    env = L2M2019Env(difficulty=3, visualize=False)
    env.change_model(model='3D', difficulty=3)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = OfficialObs(env)

    observation = env.reset(project=True, obs_as_dict=True)
    print(observation.shape)
    while True:
        _, _, done, _ = env.step(env.action_space.sample(),
                                 project=True,
                                 obs_as_dict=True)
        if done:
            break
def main(args):
    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    data = np.load('./official_obs_scaler.npz')
    obs_mean, obs_std = data['mean'], data['std']

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2. Create the actor-critic, replay buffer and GAC learner.
    if 'L2M2019Env' in args.env_name:
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(obs_dim, act_dim,
                                  hidden_sizes=args.hidden_sizes).to(device)
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)
    gac = GAC(actor_critic, replay_buffer, device=device,
              gamma=args.gamma, alpha_start=args.alpha_start,
              alpha_min=args.alpha_min, alpha_max=args.alpha_max)

    def act_encoder(y):
        # y in [min, max] ==> x in [-1, 1]
        # if args.env_name == 'L2M2019Env':
        #     return y
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        # x in [-1, 1] ==> y in [min, max]
        # if args.env_name == 'L2M2019Env':
        #     return np.abs(x)
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def get_observation(env):
        obs = np.array(env.get_observation()[242:])
        obs = (obs - obs_mean) / obs_std
        state_desc = env.get_state_desc()
        p_body = [state_desc['body_pos']['pelvis'][0],
                  -state_desc['body_pos']['pelvis'][2]]
        v_body = [state_desc['body_vel']['pelvis'][0],
                  -state_desc['body_vel']['pelvis'][2]]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        return np.append(obs, v_tgt)

    def get_reward(env):
        reward = 10.0  # reward for not falling down
        state_desc = env.get_state_desc()
        p_body = [state_desc['body_pos']['pelvis'][0],
                  -state_desc['body_pos']['pelvis'][2]]
        v_body = [state_desc['body_vel']['pelvis'][0],
                  -state_desc['body_vel']['pelvis'][2]]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        vel_penalty = np.linalg.norm(v_body - v_tgt)
        muscle_penalty = 0
        for muscle in sorted(state_desc['muscles'].keys()):
            muscle_penalty += np.square(
                state_desc['muscles'][muscle]['activation'])
        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)
        if vel_penalty < 0.3:
            ret_r += 10
        return ret_r

    # 3. Start training.
    def get_action(o, deterministic=False):
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = actor_critic.act(o, deterministic)
        return a

    def test_agent():
        test_ret, test_len = 0, 0
        for j in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(o, True)
                a = act_decoder(a)
                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d:
                        break
                o = get_observation(test_env)
            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
        if t <= args.start_steps:
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)
        a = act_decoder(a)

        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d:
                break
        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state).
        d = False if ep_len == args.max_ep_len else d

        # if not d:
        #     new_o, new_r, new_o2 = generate_success(o, o2)
        #     replay_buffer.store(new_o, a, new_r * args.reward_scale, new_o2, d)

        # Store experience to replay buffer
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)

        o = o2

        if d or (ep_len == args.max_ep_len):
            _, ep_len = env.reset(obs_as_dict=False), 0
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
                gac.update_beta()
            print("loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                  .format(loss_a, loss_c, alpha, gac.beta))

        # End of epoch handling
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print("-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic
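# --- Hypothetical driver (not from the original file) -----------------------
# main(args) above is a generator: it yields (step, test return, test length,
# actor_critic) after each end-of-epoch evaluation. One way a caller could
# consume it is sketched below; the function name, the checkpoint path, and
# the assumption that MLPActorCritic is a torch.nn.Module (so state_dict()
# exists) are ours, not the original code's.
def run_training(args):
    best_ret = -float('inf')
    for t, test_ret, test_len, ac in main(args):
        # Keep the best-performing checkpoint seen so far.
        if test_ret > best_ret:
            best_ret = test_ret
            torch.save(ac.state_dict(), 'best_actor_critic.pt')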
    1.709011708233401095e-01,  # ankle flex
    0 * np.pi / 180,  # [left] hip adduct
    -5.282323914341899296e-02,  # hip flex
    -8.041966456860847323e-01,  # knee extend
    -1.745329251994329478e-01])  # ankle flex

if mode == '2D':
    params = np.loadtxt('params_2D.txt')
elif mode == '3D':
    params = np.loadtxt('params_3D_init.txt')

locoCtrl = OsimReflexCtrl(mode=mode, dt=sim_dt)
locoCtrl.set_control_params(params)
env = L2M2019Env(visualize=visualize, seed=seed, difficulty=difficulty)
env.change_model(model=mode, difficulty=difficulty, seed=seed)
obs_dict = env.reset(project=True, seed=seed, obs_as_dict=True,
                     init_pose=INIT_POSE)
env.spec.timestep_limit = timstep_limit

total_reward = 0
t = 0
i = 0

# initiate onn network
#onn_network = ONN(features_size=2, max_num_hidden_layers=5,
#                  qtd_neuron_per_hidden_layer=10, n_classes=2, loss_fun='mse')
parser.add_argument('--seed', type=int, default=0,
                    help='random seed for evaluation')
args = parser.parse_args()

# Settings
remote_base = 'http://osim-rl-grader.aicrowd.com/'
cgp_id = args.ind

# Create environment
if args.live:
    with open(args.token, 'r') as f:
        aicrowd_token = f.read().strip()
    client = Client(remote_base)
    observation = client.env_create(aicrowd_token, env_id='L2M2019Env')
else:
    env = L2M2019Env(visualize=args.visual)
    observation = env.reset(seed=args.seed)

# CGP controller
library = build_funcLib()
ind = CGP.load_from_file(cgp_id, library)
l2meval = L2MEvaluator(1e8, 1)

i = 0
j = 0
r_total = 0.0
while True:
    inputs = l2meval.get_inputs(observation)
    outputs = l2meval.scale_outputs(ind.run(inputs))
    if args.live:
def __init__(self,
             visualize=False,
             integrator_accuracy=5e-5,
             frameskip=4,
             T=2500,
             action_clamp=False,
             difficulty=2,
             project=True):
    """A base template for all environment wrappers."""
    from osim.env import L2M2019Env

    self.env = L2M2019Env(visualize=visualize,
                          integrator_accuracy=integrator_accuracy,
                          seed=0,
                          report=None,
                          difficulty=difficulty)
    self.frameskip = frameskip
    self.T = T
    self.istep = 0
    self.action_clamp = action_clamp
    self.project = project

    # Self params
    self.state_dim = 169 if self.project else 228 + 72
    self.action_dim = 22
    self.test_size = 5

    # Trackers
    self.shaped_reward = {
        'num_footsteps': [],
        'crouch_bonus': [],
        'knee_bend': [],
        'toes_low': [],
        'x_penalty': [],
        'z_penalty': []
    }
    self.original_reward = 0.0
    self.fell_down = False

    # Reward shaping components
    self.ltoes = {'x': [], 'y': [], 'z': []}
    self.rtoes = {'x': [], 'y': [], 'z': []}
    self.ltibia = {'x': [], 'y': [], 'z': []}
    self.rtibia = {'x': [], 'y': [], 'z': []}
    self.pelvis = {'x': [], 'y': [], 'z': []}
    self.ltibia_angle = []
    self.rtibia_angle = []
    self.lfemur_angle = []
    self.rfemur_angle = []
#round_n = 1  # Round 1
round_n = 3  # Round 2

if round_n == 1:
    difficulty = 2  # 2: Round 1; 3: Round 2
    seed = None
    project = True
    obs_as_dict = True
elif round_n == 2:
    difficulty = 3  # 2: Round 1; 3: Round 2
    seed = None
    project = True
    obs_as_dict = True
else:
    difficulty = 0  # 0: constant forward velocities; 1: consecutive sinks forward for walking
    seed = None
    project = True
    obs_as_dict = True

#=== this is the official setting for Learn to Move 2019 ===#
env = L2M2019Env(seed=seed, difficulty=difficulty)
env.change_model(model=model, difficulty=difficulty, seed=seed)
obs_dict = env.reset(project=project, seed=seed, obs_as_dict=obs_as_dict)

while True:
    obs_dict, reward, done, info = env.step(env.action_space.sample(),
                                            project=project,
                                            obs_as_dict=obs_as_dict)
    if done:
        break
def __init__(self, visualization):
    # Create environment
    self.env = L2M2019Env(visualize=visualization)
    self.observation = self.env.reset()
    self.reward = 0