class Environment: def __init__(self): print("Setting env...") self.env = RunEnv(visualize=False) print("Env set !") def get_state_size(self): return list(self.env.observation_space.shape) def get_action_size(self): return self.env.action_space.shape[0] def get_bounds(self): return self.env.action_space.low, self.env.action_space.high def set_render(self, render): self.env = RunEnv(visualize=render) def reset(self): return self.env.reset(difficulty=0) def random(self): return self.env.action_space.sample() def act(self, action): return self.env.step(action) def close(self): self.env.close()
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""

    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0, 2)
        else:
            self.difficulty = difficulty
        self.learntorun_seed = None  # set via _seed(); avoids AttributeError in _reset()
        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space
        self._spec = EnvSpec("RunEnv-diff{}-v1".format(difficulty))

    def _step(self, action):
        obs, reward, terminal, info = self.learntorun_env.step(action)
        return np.asarray(obs), reward, terminal, info

    def _reset(self):
        obs = self.learntorun_env.reset(difficulty=self.difficulty,
                                        seed=self.learntorun_seed)
        return np.asarray(obs)

    def _render(self, mode='human', close=False):
        # raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""

    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0, 2)
        else:
            self.difficulty = difficulty
        self.learntorun_seed = None  # set via _seed(); avoids AttributeError in _reset()
        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space

    def _step(self, action):
        return self.learntorun_env.step(action)

    def _reset(self):
        return self.learntorun_env.reset(difficulty=self.difficulty,
                                         seed=self.learntorun_seed)

    def _render(self, mode='human', close=False):
        # raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
def __init__(self, game_name, display):
    self.game_name = game_name
    self.display = display
    # self.env = gym.make(game_name)
    self.env = RunEnv(self.display)
    self.reset()
class Environment:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        print()
        self.render = False

    def get_state_size(self):
        return list(self.env.observation_space.shape)

    def get_action_size(self):
        return self.env.action_space.shape[0]

    def get_bounds(self):
        return self.env.action_space.low, self.env.action_space.high

    def set_render(self, render):
        visu = render and DISPLAY
        if visu != self.render:
            self.render = visu
            self.env = RunEnv(visualize=visu)
            self.reset()

    def reset(self):
        return np.asarray(self.env.reset(difficulty=0))

    def random(self):
        return self.env.action_space.sample()

    def act(self, action):
        s_, r, d, i = self.env.step(action)
        return np.asarray(s_), r, d, i

    def close(self):
        self.env.close()
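# Illustrative only: a minimal random-action rollout against the Environment wrapper
# above, assuming RunEnv from osim.env is importable and the wrapper is defined in the
# same module. The step budget is arbitrary; this is a sketch, not the original code.
env = Environment()                 # builds RunEnv(visualize=False) internally
state = env.reset()
total_reward = 0.0
for _ in range(200):                # arbitrary step budget
    action = env.random()           # sample a random action from the action space
    state, reward, done, info = env.act(action)
    total_reward += reward
    if done:
        break
env.close()
print("total reward:", total_reward)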
def test():
    task_fn = lambda: LTR()
    task = task_fn()
    state_dim = task.env.observation_space.shape[0]
    action_dim = task.env.action_space.shape[0]
    with open('data/ddpg-model-LearningToRun.bin', 'rb') as f:
        model = pickle.load(f)
    actor = DDPGActorNet(state_dim, action_dim)
    actor.load_state_dict(model)
    logger = Logger('./log')
    env = RunEnv(visualize=False)
    state = env.reset(difficulty=0)
    print(state)
    done = False
    total_reward = 0.0
    step = 0
    while not done:
        action = actor.predict(np.stack([state]), to_numpy=True).flatten()
        state, reward, done, info = env.step(action)
        total_reward += reward
        step += 1
        logger.histo_summary('input', actor.input, step)
        logger.histo_summary('act1', actor.act1, step)
        logger.histo_summary('act2', actor.act2, step)
        logger.histo_summary('pre_act3', actor.pre_act3, step)
        logger.histo_summary('act3', actor.act3, step)
        for tag, value in actor.named_parameters():
            tag = tag.replace('.', '/')
            logger.histo_summary(tag, value.data.numpy(), step)
    print(total_reward)
    print(step)
def test1(self):
    env = RunEnv(visualize=False)
    observation = env.reset()
    action = env.action_space.sample()
    action[5] = np.NaN
    self.assertRaises(ValueError, env.step, action)
def run(self):
    self.env = RunEnv(visualize=False)
    self.env.reset(difficulty=2, seed=int(time.time()))
    if self.monitor:
        self.env.monitor.start('monitor/', force=True)

    # tensorflow variables (same as in model.py)
    self.observation_size = 55 + 7
    self.action_size = np.prod(self.env.action_space.shape)
    self.hidden_size = 128
    weight_init = tf.random_uniform_initializer(-0.05, 0.05)
    bias_init = tf.constant_initializer(0)

    # tensorflow model of the policy
    self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
    self.debug = tf.constant([2, 2])
    with tf.variable_scope("policy-a"):
        h1 = fully_connected(self.obs, self.observation_size, self.hidden_size,
                             weight_init, bias_init, "policy_h1")
        h1 = tf.nn.relu(h1)
        h2 = fully_connected(h1, self.hidden_size, self.hidden_size,
                             weight_init, bias_init, "policy_h2")
        h2 = tf.nn.relu(h2)
        h3 = fully_connected(h2, self.hidden_size, self.action_size,
                             weight_init, bias_init, "policy_h3_1")
        h3 = tf.nn.tanh(h3, name="policy_h3")
        action_dist_logstd_param = tf.Variable(
            (.01 * np.random.randn(1, self.action_size)).astype(np.float32),
            name="policy_logstd")
        self.action_dist_mu = h3
        self.action_dist_logstd = tf.tile(
            action_dist_logstd_param,
            tf.stack((tf.shape(self.action_dist_mu)[0], 1)))

    config = tf.ConfigProto(device_count={'CPU': 0})
    self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())
    var_list = tf.trainable_variables()
    self.set_policy = SetPolicyWeights(self.session, var_list)

    while True:
        # get a task, or wait until it gets one
        next_task = self.task_q.get(block=True)
        if next_task == 1:
            # the task is an actor request to collect experience
            path = self.rollout()
            self.task_q.task_done()
            self.result_q.put(path)
        elif next_task == 2:
            print("kill message")
            if self.monitor:
                self.env.monitor.close()
            self.task_q.task_done()
            break
        else:
            # the task is to set parameters of the actor policy
            self.set_policy(next_task)
            # super hacky method to make sure when we fill the queue with set parameter tasks,
            # an actor doesn't finish updating before the other actors can accept their own tasks.
            time.sleep(0.1)
            self.task_q.task_done()
    return
def test_actions(self):
    env = RunEnv(visualize=False)
    env.reset()
    v = env.action_space.sample()
    v[0] = 1.5
    v[1] = -0.5
    observation, reward, done, info = env.step(v)
    self.assertLessEqual(env.last_action[0], 1.0)
    self.assertGreaterEqual(env.last_action[1], 0.0)
def __init__(self, visualize=False, difficulty=None):
    super(LearnToRunEnv, self).__init__()
    if difficulty is None:
        self.difficulty = random.randint(0, 2)
    else:
        self.difficulty = difficulty
    self.learntorun_env = RunEnv(visualize=visualize)
    self.observation_space = self.learntorun_env.observation_space
    self.action_space = self.learntorun_env.action_space
def __init__(self, visualize=False, token=None, max_obstacles=3):
    logger.info("max_obstacles={}".format(max_obstacles))
    if token is None:
        self.remote_env = False
        self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
    else:
        self.remote_env = True
        self.local_env = RunEnv(visualize=False, max_obstacles=max_obstacles)
        self.token = token
        self.env = Client(GRADER_URL)
        self.env_created = False
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        from osim.env import RunEnv
        # RunEnv = runenv_with_alternative_obstacle_generation_scheme()
        e = RunEnv(visualize=False, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as err:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        cq.put(('error', e))

    def floatify(np):
        return [float(np[i]) for i in range(len(np))]

    try:
        while True:
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string
            # isinstance is dangerous, commented out
            # if not isinstance(msg, tuple):
            #     raise Exception('pipe message received by headless is not a tuple')
            if msg[0] == 'reset':
                o = e.reset(difficulty=0)
                cq.put(floatify(o))
            elif msg[0] == 'step':
                o, r, d, i = e.step(msg[1])
                o = floatify(o)  # floatify the observation
                cq.put((o, r, d, i))
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process
class OsimEnv(Env):
    def __init__(self, visualize=True, test=False, step_size=0.01,
                 processor=None, timestep_limit=1000):
        self.visualize = visualize
        self._osim_env = RunEnv(visualize=visualize)
        self._osim_env.stepsize = step_size
        self._osim_env.spec.timestep_limit = timestep_limit
        self._osim_env.horizon = timestep_limit
        # self._osim_env.integration_accuracy = 1e-1
        if test:
            self._osim_env.timestep_limit = 1000
        self.processor = processor
        print("stepsize: " + str(self._osim_env.stepsize))

    def reset(self, seed=None, difficulty=2):
        observation = self._osim_env.reset(seed=seed, difficulty=difficulty)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, 0.0, False, dict())
        return observation

    def step(self, action):
        if self.processor:
            action = self.processor.process_action(action)
        observation, reward, done, info = self._osim_env.step(action)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, reward, done, info)
        return observation, reward, done, info

    def get_observation_dim(self):
        return len(self.reset())

    def get_action_dim(self):
        nb_actions = self._osim_env.action_space.shape[0]
        return nb_actions

    # FOR PICKLING
    def __setstate__(self, state):
        self.__init__(visualize=state['visualize'])

    def __getstate__(self):
        state = {'visualize': self.visualize}
        return state
def standalone_headless_isolated(conn, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', conn)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        conn.send(('error', e))

    def floatify(np):
        return [float(np[i]) for i in range(len(np))]

    try:
        while True:
            msg = conn.recv()
            # messages should be tuples,
            # msg[0] should be string
            # isinstance is dangerous, commented out
            # if not isinstance(msg, tuple):
            #     raise Exception('pipe message received by headless is not a tuple')
            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(floatify(o))
            elif msg[0] == 'step':
                # step() returns a tuple; copy to a list so the observation can be replaced
                ordi = list(e.step(msg[1]))
                ordi[0] = floatify(ordi[0])
                conn.send(ordi)
            else:
                conn.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process
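# Illustrative parent-side driver for standalone_headless_isolated above -- a sketch of
# assumed usage, not code from the original repo. It mirrors the message protocol the
# child expects: ('reset',), ('step', action), and any other tuple to shut down.
from multiprocessing import Process, Pipe, Lock

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    plock = Lock()
    p = Process(target=standalone_headless_isolated, args=(child_conn, plock))
    p.start()

    parent_conn.send(('reset',))          # ask the child process to reset the env
    obs = parent_conn.recv()              # plain list of floats

    action = [0.0] * 18                   # RunEnv expects 18 muscle activations
    parent_conn.send(('step', action))
    obs, reward, done, info = parent_conn.recv()

    parent_conn.send(('close',))          # any other message shuts the child down
    p.join()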
def __init__(self, game='l2r', visualize=False, max_obstacles=10, skip_count=1):
    self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
    self.step_count = 0
    self.old_observation = None
    self.skip_count = 1  # skip_count # 4
    self.last_x = 0
    self.current_x = 0
    self.observation_space_shape = (76,)
    self.action_space = self.env.action_space
    self.difficulty = 2
class LTR(BasicTask):
    name = 'LearningToRun'
    success_threshold = 2000

    def __init__(self):
        BasicTask.__init__(self)
        self.env = RunEnv(visualize=False)

    def step(self, action):
        action = np.clip(action, 0, 1)
        next_state, reward, done, info = self.env.step(action)
        return np.asarray(next_state) / math.pi, reward, done, info

    def reset(self):
        state = self.env.reset(difficulty=0, seed=np.random.randint(0, 10000000))
        return np.asarray(state) / math.pi
def create_env(args):
    env = RunEnv(visualize=True, max_obstacles=args.max_obstacles)
    if hasattr(args, "baseline_wrapper") or hasattr(args, "ddpg_wrapper"):
        env = DdpgWrapper(env, args)
    return env
def main():
    env = RunEnv(visualize=False)
    population = [[NN(), 0] for _ in range(100)]
    generation = 0
    for _ in range(2000):
        for i in range(len(population)):
            print(i)
            population[i][1] = run(population[i][0], env)
        population = sorted(population, key=lambda x: x[1], reverse=True)
        print(np.mean([p[1] for p in population[:5]]))
        generation += 1
        population = population[:50]
        for _ in range(20):
            population.append([random.choice(population[:50])[0].mutate(), 0])
        for _ in range(20):
            nn1 = random.choice(population[:20])[0]
            nn2 = random.choice(population[:50])[0]
            population.append([nn1.crossover(nn2), 0])
        for _ in range(10):
            population.append([NN(), 0])
        with open('save.p', 'wb') as f:
            pickle.dump(population, f)
def __init__(self, visualize=True, test=False, step_size=0.01,
             processor=None, timestep_limit=1000):
    self.visualize = visualize
    self._osim_env = RunEnv(visualize=visualize)
    self._osim_env.stepsize = step_size
    self._osim_env.spec.timestep_limit = timestep_limit
    self._osim_env.horizon = timestep_limit
    # self._osim_env.integration_accuracy = 1e-1
    if test:
        self._osim_env.timestep_limit = 1000
    self.processor = processor
    print("stepsize: " + str(self._osim_env.stepsize))
def test():
    env = RunEnv(visualize=False)
    observation_d = env.reset(project=False)
    observation = process_obs_dict(observation_d)
    total_reward = 0
    steps = 0
    while True:
        # a = AGENT OUTPUT
        a, q = agent.act(observation)
        observation_d, reward, done, info = env.step(a, project=False)
        observation = process_obs_dict(observation_d)
        total_reward += reward
        steps += 1
        # print(observation)
        print(steps, 'total reward:', total_reward)
        if done:
            break
    print('finished testing!')
def test(skip=4):
    test_env = RunEnv(visualize=True, max_obstacles=0)
    fast_env = FastEnv(test_env, skip)  # 4 is skip factor
    agent.training = False
    agent.play(fast_env, noise_level=1e-11, episode_index=-1)
    agent.training = True
    del test_env
def test(frameskip=1, vis=False):
    env = RunEnv(visualize=vis)
    # env.change_model(model='2D', prosthetic=True, difficulty=0, seed=None)
    observation_d = env.reset(project=False)
    # observation = process_obs_dict(observation_d)
    total_reward = 0
    steps = 0
    while True:
        # a = AGENT OUTPUT
        observation = process_obs_dict(observation_d)
        a, q = agent.act(observation)
        for _ in range(frameskip):
            observation_d, reward, done, info = env.step(a, project=False)
            # observation = process_obs_dict(observation_d)
            total_reward += reward
            steps += 1
        # print(observation)
        print(steps, 'total reward:', total_reward)
        if done:
            break
    print('finished testing!')
def main():
    env = RunEnv(visualize=False)
    s = socket.socket()
    s.bind(("localhost", 8000))
    s.listen(10)  # max number of connections
    while True:
        sc, address = s.accept()
        f = open("work.p", 'wb')
        while True:
            l = sc.recv(1024)
            while l:
                f.write(l)
                l = sc.recv(1024)
            f.close()
            with open('work.p', 'rb') as f:
                nn = pickle.load(f)
            reward = run(nn, env)
            sc.send(str(reward).encode())
            sc.close()
        s.close()
class OsimAdapter:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        self.reset()

    def reset(self, difficulty=2):
        self.reward = 0
        self.total_reward = 0
        self.timestamp = 0.
        self.features = np.array(
            (self.env.reset(difficulty=difficulty))).reshape((1, -1))
        self.last_obs = np.zeros(shape=(1, 41))
        self.features = np.concatenate([self.features, self.last_obs], axis=1)
        self.done = False
        return self.features

    def get_action_space(self):
        space = [1] * 18
        return space

    def get_observation_space(self):
        return 41 * 2

    def step(self, actions):
        mean_possible = (np.array(self.env.action_space.low) +
                         np.array(self.env.action_space.high)) / 2.
        actions = np.array(actions) + mean_possible
        actions *= (np.array(self.env.action_space.high) -
                    np.array(self.env.action_space.low))
        actions = np.clip(actions, self.env.action_space.low,
                          self.env.action_space.high)
        obs, reward1, done, _ = self.env.step(actions)
        reward2 = 0
        if not done:
            obs, reward2, done, _ = self.env.step(actions)
        self.features = np.array(obs).reshape((1, -1))
        self.features = np.concatenate(
            [self.features, self.features - self.last_obs], axis=1)
        self.last_obs = np.array(obs).reshape((1, -1))
        self.reward = reward1 + reward2
        self.total_reward += self.reward
        self.done = done
        self.timestamp += 1

    def get_total_reward(self):
        return self.total_reward
def Simulation(proxy_agent, index, return_dict, episodes, vis=False):
    print('starting simulation')
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)
    rewards = np.zeros(episodes)
    totalreward = 0
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        for i in range(1, 1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            # means it didn't go the full simulation
            if done and i < 1000:
                reward = 0
            state = Preprocess.GetState(observation)
            s, a, r, sp = Preprocess.ConvertToTensor(prevState, action, reward, state)
            totalreward += reward
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                print("reseting environment" + str(episode))
                rewards[episode] = totalreward
                totalreward = 0
                break
            action = proxy_agent(Variable(s, volatile=True))
            action = action.data.numpy()
            prevState = state
    return_dict[index] = np.sum(rewards) / episodes
    return np.sum(rewards) / episodes
def build_model(shared_object):
    shared_object['env'] = RunEnv(shared_object.get('visualize', False))
    model_class_name = 'models.agents.' + shared_object.get('model_class', None)
    log_info('importing class : {}'.format(model_class_name))
    model_class = import_class(model_class_name)
    log_info('{} successfully imported'.format(model_class_name))
    log_info('building model')
    model = model_class(shared_object)
    return model
def test(args):
    print('start testing')
    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)
    np.random.seed(args.seed)
    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()
        state = fg.gen(state)
        # obs = fg.traj[0]
        # print(obs.left_knee_r, obs.right_knee_r)
        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)
            # obs = fg.traj[0]
            # print(obs.left_knee_r, obs.right_knee_r)
            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()
            state = next_state
            ep_reward += reward
            step += 1
            print('reward:', ep_reward)
            if done:
                break
        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(i, ep_reward, step))
def create(self, env_id, seed=None):
    try:
        if env_id == 'osim':
            from osim.env import RunEnv
            env = RunEnv(visualize=True)
        else:
            env = gym.make(env_id)
            print('making environment')
        if seed:
            env.seed(seed)
    except gym.error.Error:
        raise InvalidUsage(
            "Attempted to look up malformed environment ID '{}'".format(env_id))
    instance_id = str(uuid.uuid4().hex)[:self.id_len]
    self.envs[instance_id] = env
    self.envs_id[instance_id] = env_id
    return instance_id
def Simulation(proxy_agent, episodes, vis=False):
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)
    memory = random.randint(1000, 2000)
    tau = random.uniform(0.01, .9)
    epsilon = random.uniform(.15, .9)
    target = proxy_agent.ProduceTargetActorCritic(memory, tau, epsilon)
    batches = [16, 32, 64, 128]
    batchsize = batches[random.randint(0, len(batches) - 1)]
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        if vis:
            target.OUprocess(0, 0.15, 0.0)
        else:
            target.OUprocess(random.random(), 0.15, 0.0)
        pelvis_y = 0
        for i in range(1, 1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            # means it didn't go the full simulation
            if i > 1:
                reward += (observation[2] - pelvis_y) * 0.01  # penalty for pelvis going down
            reward = env.current_state[4] * 0.01
            reward += 0.01  # small reward for still standing
            reward += min(0, env.current_state[22] - env.current_state[1]) * 0.1  # penalty for head behind pelvis
            reward -= sum([max(0.0, k - 0.1) for k in [env.current_state[7], env.current_state[10]]]) * 0.02  # penalty for straight legs
            if done and i < 1000:
                reward = 0
            state = Preprocess.GetState(observation)
            s, a, r, sp = Preprocess.ConvertToTensor(prevState, action, reward, state)
            target.addToMemory(s, a, r, sp)
            # env.render()
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                if target.primedToLearn():
                    lock.acquire()
                    proxy_agent.PerformUpdate(batchsize, target)
                    target.UpdateTargetNetworks(agent.getCritic(), agent.getActor())
                    print("saving actor")
                    proxy_agent.saveActorCritic()
                    print("actor saved")
                    lock.release()
                print("reseting environment" + str(episode))
                break
            action = target.selectAction(s)
            action = action.numpy()
            prevState = state
class Worker:
    def __init__(self, wid, diff):
        self.wid = wid
        self.env = RunEnv(visualize=False)
        self.dif = diff
        self.Actor = models.create_actor(args.feature, 18)

    def choose_action(self, state):
        state = torch.from_numpy(state).unsqueeze(0)
        action_mean, _, action_std = self.Actor(Variable(state))
        action = torch.normal(action_mean, action_std)
        return action

    def work(self, globalPPO):
        self.Actor.load_state_dict(globalPPO.state_dict())
        while True:
            ep_r = 0
            step_count = 0
            state1, state2, state3, state = [0] * 60, [0] * 60, [0] * 60, [0] * 60
            balls = []
            state = self.env.reset(difficulty=self.dif)
            state1, state2, state3, state = process_observation(state1, state2, state3, state, balls)
            state = numpy.array(state)
            buffer_s, buffer_a, buffer_r = [], [], []
            while True:
                if not ROLLING_EVENT.is_set():
                    ROLLING_EVENT.wait()
                    self.Actor.load_state_dict(globalPPO.state_dict())
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.choose_action(state)
                r = 0
                _, _r, _, _ = self.env.step(a)
                r += _r
                _, _r, _, _ = self.env.step(a)
                r += _r
                next_state, _r, done, _ = self.env.step(a)
                r += _r
                buffer_s.append(state)
                buffer_a.append(a)
                buffer_r.append(r)
                addball_if_new(next_state, balls)
                state1, state2, state3, next_state = process_observation(state1, state2, state3, next_state, balls)
                next_state = numpy.array(next_state)
                state = next_state
                ep_r = ep_r + r
                GLOBAL_UPDATE_COUNTER.set()
                if done == True or FILL.is_set():
from osim.env import RunEnv
import numpy as np
import copy
import pickle

env = RunEnv(visualize=False)
observation = env.reset(difficulty=0)
sin = np.sin
file_Name = "w_best"
array = np.array
T = 4
alpha = 0.01
alpha_0 = 0.01

# TODO: we should exploit the Fourier property that higher-harmonic weights tend to
# decay as 1/x^n for smooth and continuous functions.
# I initialize the weights list to 0, with 4 weights for each muscle (the periodic
# function is composed of 4 elements of a Fourier series).
# I define weights for only 9 periodic functions, as I assume the legs move
# symmetrically in time.
w = []
for i in range(9):
    w.append(np.array([0., 0., 0., 0., 0., 0., 0., 0.]))

def output(a, T, t):
    # Output of a 4th degree Fourier Series of sin.
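# The body of output() is truncated in the snippet above. Purely as an illustration of
# the described approach, a 4-term Fourier-style controller output might look like the
# sketch below. The split of the 8 weights into 4 sine and 4 cosine coefficients is an
# assumption, not the original author's code, and the function name is hypothetical.
import numpy as np

def fourier_output(a, T, t):
    # a: weight vector of length 8, T: period, t: current time
    value = 0.0
    for n in range(4):
        value += a[n] * np.sin(2.0 * np.pi * (n + 1) * t / T)
        value += a[n + 4] * np.cos(2.0 * np.pi * (n + 1) * t / T)
    # clip to the valid muscle-activation range [0, 1]
    return float(np.clip(value, 0.0, 1.0))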