def Simulation(proxy_agent, index, return_dict, episodes, vis=False):
    print('starting simulation')
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)
    rewards = np.zeros(episodes)
    totalreward = 0
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        for i in range(1, 1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            # done before step 1000 means it didn't go the full simulation
            if done and i < 1000:
                reward = 0
            state = Preprocess.GetState(observation)
            s, a, r, sp = Preprocess.ConvertToTensor(prevState, action, reward, state)
            totalreward += reward
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                print("resetting environment " + str(episode))
                rewards[episode] = totalreward
                totalreward = 0
                break
            action = proxy_agent(Variable(s, volatile=True))
            action = action.data.numpy()
            prevState = state
    return_dict[index] = np.sum(rewards) / episodes
    return np.sum(rewards) / episodes
def Simulation(proxy_agent, episodes, vis=False):
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)
    memory = random.randint(1000, 2000)
    tau = random.uniform(0.01, 0.9)
    epsilon = random.uniform(0.15, 0.9)
    target = proxy_agent.ProduceTargetActorCritic(memory, tau, epsilon)
    batches = [16, 32, 64, 128]
    batchsize = batches[random.randint(0, len(batches) - 1)]
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        if vis:
            target.OUprocess(0, 0.15, 0.0)
        else:
            target.OUprocess(random.random(), 0.15, 0.0)
        pelvis_y = 0
        for i in range(1, 1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            if i > 1:
                reward += (observation[2] - pelvis_y) * 0.01  # penalty for pelvis going down
            reward = env.current_state[4] * 0.01
            reward += 0.01  # small reward for still standing
            reward += min(0, env.current_state[22] - env.current_state[1]) * 0.1  # penalty for head behind pelvis
            reward -= sum([max(0.0, k - 0.1) for k in [env.current_state[7], env.current_state[10]]]) * 0.02  # penalty for straight legs
            # done before step 1000 means it didn't go the full simulation
            if done and i < 1000:
                reward = 0
            state = Preprocess.GetState(observation)
            s, a, r, sp = Preprocess.ConvertToTensor(prevState, action, reward, state)
            target.addToMemory(s, a, r, sp)
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                if target.primedToLearn():
                    lock.acquire()
                    proxy_agent.PerformUpdate(batchsize, target)
                    target.UpdateTargetNetworks(agent.getCritic(), agent.getActor())
                    print("saving actor")
                    proxy_agent.saveActorCritic()
                    print("actor saved")
                    lock.release()
                print("resetting environment " + str(episode))
                break
            action = target.selectAction(s)
            action = action.numpy()
            prevState = state
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""

    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0, 2)
        else:
            self.difficulty = difficulty
        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space
        self._spec = EnvSpec("RunEnv-diff{}-v1".format(difficulty))

    def _step(self, action):
        obs, reward, terminal, info = self.learntorun_env.step(action)
        return np.asarray(obs), reward, terminal, info

    def _reset(self):
        obs = self.learntorun_env.reset(difficulty=self.difficulty,
                                        seed=self.learntorun_seed)
        return np.asarray(obs)

    def _render(self, mode='human', close=False):
        # raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
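# Hypothetical usage sketch for the LearnToRunEnv wrapper above (variable names
# are assumptions, not from the original source). Old-style gym.Env dispatches
# seed()/reset()/step() to the _seed/_reset/_step methods; note that _reset
# reads self.learntorun_seed, so seed() must be called before the first reset().
env = LearnToRunEnv(visualize=False, difficulty=0)
env.seed(42)
observation = env.reset()
done = False
while not done:
    observation, reward, done, info = env.step(env.action_space.sample())
env.close()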
class Environment:
    def __init__(self):
        print("Setting env...")
        self.env = RunEnv(visualize=False)
        print("Env set!")

    def get_state_size(self):
        return list(self.env.observation_space.shape)

    def get_action_size(self):
        return self.env.action_space.shape[0]

    def get_bounds(self):
        return self.env.action_space.low, self.env.action_space.high

    def set_render(self, render):
        self.env = RunEnv(visualize=render)

    def reset(self):
        return self.env.reset(difficulty=0)

    def random(self):
        return self.env.action_space.sample()

    def act(self, action):
        return self.env.step(action)

    def close(self):
        self.env.close()
class Environment:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        print()
        self.render = False

    def get_state_size(self):
        return list(self.env.observation_space.shape)

    def get_action_size(self):
        return self.env.action_space.shape[0]

    def get_bounds(self):
        return self.env.action_space.low, self.env.action_space.high

    def set_render(self, render):
        visu = render and DISPLAY
        if visu != self.render:
            self.render = visu
            self.env = RunEnv(visualize=visu)
            self.reset()

    def reset(self):
        return np.asarray(self.env.reset(difficulty=0))

    def random(self):
        return self.env.action_space.sample()

    def act(self, action):
        s_, r, d, i = self.env.step(action)
        return np.asarray(s_), r, d, i

    def close(self):
        self.env.close()
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""

    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0, 2)
        else:
            self.difficulty = difficulty
        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space

    def _step(self, action):
        return self.learntorun_env.step(action)

    def _reset(self):
        return self.learntorun_env.reset(difficulty=self.difficulty,
                                         seed=self.learntorun_seed)

    def _render(self, mode='human', close=False):
        # raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
def test():
    task_fn = lambda: LTR()
    task = task_fn()
    state_dim = task.env.observation_space.shape[0]
    action_dim = task.env.action_space.shape[0]
    with open('data/ddpg-model-LearningToRun.bin', 'rb') as f:
        model = pickle.load(f)
    actor = DDPGActorNet(state_dim, action_dim)
    actor.load_state_dict(model)
    logger = Logger('./log')
    env = RunEnv(visualize=False)
    state = env.reset(difficulty=0)
    print(state)
    done = False
    total_reward = 0.0
    step = 0
    while not done:
        action = actor.predict(np.stack([state]), to_numpy=True).flatten()
        state, reward, done, info = env.step(action)
        total_reward += reward
        step += 1
        logger.histo_summary('input', actor.input, step)
        logger.histo_summary('act1', actor.act1, step)
        logger.histo_summary('act2', actor.act2, step)
        logger.histo_summary('pre_act3', actor.pre_act3, step)
        logger.histo_summary('act3', actor.act3, step)
        for tag, value in actor.named_parameters():
            tag = tag.replace('.', '/')
            logger.histo_summary(tag, value.data.numpy(), step)
    print(total_reward)
    print(step)
class OsimAdapter:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        self.reset()

    def reset(self, difficulty=2):
        self.reward = 0
        self.total_reward = 0
        self.timestamp = 0.
        self.features = np.array(
            self.env.reset(difficulty=difficulty)).reshape((1, -1))
        self.last_obs = np.zeros(shape=(1, 41))
        self.features = np.concatenate([self.features, self.last_obs], axis=1)
        self.done = False
        return self.features

    def get_action_space(self):
        space = [1] * 18
        return space

    def get_observation_space(self):
        return 41 * 2

    def step(self, actions):
        mean_possible = (np.array(self.env.action_space.low) +
                         np.array(self.env.action_space.high)) / 2.
        actions = np.array(actions) + mean_possible
        actions *= (np.array(self.env.action_space.high) -
                    np.array(self.env.action_space.low))
        actions = np.clip(actions, self.env.action_space.low,
                          self.env.action_space.high)
        obs, reward1, done, _ = self.env.step(actions)
        reward2 = 0
        if not done:
            obs, reward2, done, _ = self.env.step(actions)
        self.features = np.array(obs).reshape((1, -1))
        self.features = np.concatenate(
            [self.features, self.features - self.last_obs], axis=1)
        self.last_obs = np.array(obs).reshape((1, -1))
        self.reward = reward1 + reward2
        self.total_reward += self.reward
        self.done = done
        self.timestamp += 1

    def get_total_reward(self):
        return self.total_reward
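# Hypothetical usage sketch for the OsimAdapter above (the zero action is an
# assumption): the adapter expects roughly zero-centered actions, shifts them by
# the action-space midpoint, repeats each action for two solver steps, and keeps
# a (1, 82) feature row of [obs, obs - last_obs].
adapter = OsimAdapter()
features = adapter.reset(difficulty=0)
while not adapter.done:
    adapter.step(np.zeros(18))  # zero maps to mid-range muscle excitations
print(adapter.get_total_reward())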
def test_actions(self):
    env = RunEnv(visualize=False)
    env.reset()
    v = env.action_space.sample()
    v[0] = 1.5
    v[1] = -0.5
    observation, reward, done, info = env.step(v)
    self.assertLessEqual(env.last_action[0], 1.0)
    self.assertGreaterEqual(env.last_action[1], 0.0)
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        # RunEnv = runenv_with_alternative_obstacle_generation_scheme()
        e = RunEnv(visualize=False, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as err:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors (since you can't just throw them over a pipe)
        # e should be a string
        print('(standalone) got error!!!')
        cq.put(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        while True:
            msg = pq.get()
            # messages should be tuples; msg[0] should be a string
            # isinstance is dangerous, so it is not checked here
            if msg[0] == 'reset':
                o = e.reset(difficulty=0)
                cq.put(floatify(o))
            elif msg[0] == 'step':
                o, r, d, i = e.step(msg[1])
                o = floatify(o)  # floatify the observation
                cq.put((o, r, d, i))
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))
    return  # end process
def standalone_headless_isolated(conn, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', conn)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False)
    except Exception:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors (since you can't just throw them over a pipe)
        # e should be a string
        print('(standalone) got error!!!')
        conn.send(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        while True:
            msg = conn.recv()
            # messages should be tuples; msg[0] should be a string
            # isinstance is dangerous, so it is not checked here
            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(floatify(o))
            elif msg[0] == 'step':
                # step returns a tuple, so rebuild it with a floatified observation
                o, r, d, i = e.step(msg[1])
                conn.send((floatify(o), r, d, i))
            else:
                conn.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))
    return  # end process
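# Hypothetical parent-side sketch for the pipe protocol used above (process and
# connection names are assumptions). The worker understands ('reset',) and
# ('step', action) message tuples and replies with the observation or the
# (observation, reward, done, info) tuple; any other message shuts it down.
from multiprocessing import Process, Pipe, Lock

parent_conn, child_conn = Pipe()
plock = Lock()
worker = Process(target=standalone_headless_isolated, args=(child_conn, plock))
worker.start()

parent_conn.send(('reset',))
observation = parent_conn.recv()
parent_conn.send(('step', [0.0] * 18))
observation, reward, done, info = parent_conn.recv()
parent_conn.send(('quit',))  # unrecognized tag: worker deletes the env and exits
worker.join()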
class OsimEnv(Env):
    def __init__(self, visualize=True, test=False, step_size=0.01,
                 processor=None, timestep_limit=1000):
        self.visualize = visualize
        self._osim_env = RunEnv(visualize=visualize)
        self._osim_env.stepsize = step_size
        self._osim_env.spec.timestep_limit = timestep_limit
        self._osim_env.horizon = timestep_limit
        # self._osim_env.integration_accuracy = 1e-1
        if test:
            self._osim_env.timestep_limit = 1000
        self.processor = processor
        print("stepsize: " + str(self._osim_env.stepsize))

    def reset(self, seed=None, difficulty=2):
        observation = self._osim_env.reset(seed=seed, difficulty=difficulty)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, 0.0, False, dict())
        return observation

    def step(self, action):
        if self.processor:
            action = self.processor.process_action(action)
        observation, reward, done, info = self._osim_env.step(action)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, reward, done, info)
        return observation, reward, done, info

    def get_observation_dim(self):
        return len(self.reset())

    def get_action_dim(self):
        nb_actions = self._osim_env.action_space.shape[0]
        return nb_actions

    # FOR PICKLING
    def __setstate__(self, state):
        self.__init__(visualize=state['visualize'])

    def __getstate__(self):
        state = {'visualize': self.visualize}
        return state
class LTR(BasicTask):
    name = 'LearningToRun'
    success_threshold = 2000

    def __init__(self):
        BasicTask.__init__(self)
        self.env = RunEnv(visualize=False)

    def step(self, action):
        action = np.clip(action, 0, 1)
        next_state, reward, done, info = self.env.step(action)
        return np.asarray(next_state) / math.pi, reward, done, info

    def reset(self):
        state = self.env.reset(difficulty=0, seed=np.random.randint(0, 10000000))
        return np.asarray(state) / math.pi
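# Hypothetical usage sketch for the LTR task above (the random action is an
# assumption): observations come back scaled by 1/pi, and step() clips whatever
# the policy emits into the [0, 1] muscle-excitation range.
task = LTR()
state = task.reset()
next_state, reward, done, info = task.step(np.random.uniform(-1, 2, size=18))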
def test(args):
    print('start testing')
    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)
    np.random.seed(args.seed)
    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()
        state = fg.gen(state)
        # obs = fg.traj[0]
        # print(obs.left_knee_r, obs.right_knee_r)
        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)
            # obs = fg.traj[0]
            # print(obs.left_knee_r, obs.right_knee_r)
            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()
            state = next_state
            ep_reward += reward
            step += 1
            print('reward:', ep_reward)
            if done:
                break
        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            i, ep_reward, step))
def standalone_headless_isolated(conn, visualize, n_obstacles, run_logs_dir,
                                 additional_info, higher_pelvis=0.65):
    try:
        e = RunEnv(visualize=visualize, max_obstacles=n_obstacles)
        if higher_pelvis != 0.65:
            bind_alternative_pelvis_judgement(e, higher_pelvis)
        e = MyRunEnvLogger(e, log_dir=run_logs_dir,
                           additional_info=additional_info)
        while True:
            msg = conn.recv()
            # messages should be tuples; msg[0] should be a string
            if msg[0] == 'reset':
                o = e.reset(difficulty=msg[1], seed=msg[2])
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            elif msg[0] == 'close':
                e.close()
                conn.send(None)
                import psutil
                current_process = psutil.Process()
                children = current_process.children(recursive=True)
                for child in children:
                    child.terminate()
                return
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        conn.send(e)
class GameManager:
    def __init__(self, game_name, display):
        self.game_name = game_name
        self.display = display
        # self.env = gym.make(game_name)
        self.env = RunEnv(self.display)
        self.reset()

    def reset(self):
        observation = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        observation, reward, done, info = self.env.step(action)
        return observation, reward, done, info

    def _update_display(self):
        # if self.display:
        #     self.env.render()
        return
def main():
    env = RunEnv(visualize=True)
    env.close()
    with open('save.p', 'rb') as f:  # pickle files must be opened in binary mode
        population = pickle.load(f)
    nn = population[0][0]
    total_reward = 0
    observation = env.reset()
    for i in range(200):
        step = nn.compute(i)
        observation, reward, done, info = env.step(step)
        total_reward += reward
        if done:
            break
    print(total_reward)
def test(actor, critic, args, act_update_fn):
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)
    all_episode_metrics = []
    for episode in range(args.num_episodes):
        episode_metrics = {
            "reward": 0.0,
            "step": 0,
        }
        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)
        observation = env.reset(difficulty=2, seed=SEEDS[episode % len(SEEDS)])
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)
        done = False
        while not done:
            print(episode_metrics["reward"])
            action = act_fn(observation)
            observation, reward, done, _ = env.step(action_handler(action))
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1
            if done:
                break
            observation = observation_handler(observation, action)
        all_episode_metrics.append(episode_metrics)
    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())
#         vec = [0.3] * 18
#         val = 0.9
#         vec[7] = val
#         # vec[16] = val
#     elif ctr < 105:
#         vec = [0.2] * 18
#         val = 0.9
#         vec[9] = val
#     elif ctr < 115:
#         vec = [0.2] * 18  # contract upper muscles
#         val = 0.9
#         vec[16] = val
#     elif ctr < 125:
#         vec = [0.0] * 18
#         val = 0.9
#         vec[0] = val
#         vec[9] = val
#     else:
#         vec = [0.0] * 18
#     print(observation)
    return vec


ctr = 0
for i in range(1000):
    observation, reward, done, info = env.step(my_controller(observation, ctr))
    ctr += 1
    ctr = ctr % 100
    print(ctr)
from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
for i in range(200):
    observation, reward, done, info = env.step(env.action_space.sample())
    print(reward)
    if done:
        break
# pdb.set_trace()
# print max_action_steps
# for i in range(max_action_steps):
#     # print type(my_controller(observation, i)[0])
#     observation, reward, done, info = env.step(my_controller(observation, i))
total_reward = 0
# print max_action_steps
# for i in range(min(max_action_steps, 500)):
#     # print type(my_controller(observation, i)[0])
#     observation, reward, done, info = env.step(my_controller(observation, i))
#     if (observation[2] < 0.65):
#         break
#     total_reward += reward
# print(total_reward)
i = 0
while True:
    observation, reward, done, info = env.step(
        my_controller(observation, i % max_action_steps))
    total_reward += reward
    print('Total reward', total_reward, 'Iter', i)
    i += 1
    if observation[2] < 0.65:  # pelvis height below threshold: the model has fallen
        break
print("Terminating")
# print observation
def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [0] * 41
    last_v = [0] * 10
    env = RunEnv(visualize=False)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()
    for i_episode in range(args.start_epoch + 1, 999999):
        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            last_state, last_v, state = process_observation(
                last_state, last_v, state)
            state = numpy.array(state)
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            reward_sum = 0
            for t in range(10000):  # don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                reward = 0
                if args.skip:
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)
                next_state = numpy.array(next_state)
                reward_sum += reward
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)
        counter.increment()
        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode,
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=True, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors (since you can't just throw them over a pipe)
        # e should be a string
        print('(standalone) got error!!!')
        cq.put(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        while True:
            msg = pq.get()
            # messages should be tuples; msg[0] should be a string
            # isinstance is dangerous, so it is not checked here
            if msg[0] == 'reset':
                o = e.reset(difficulty=0)
                o = floatify(o)
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)
            elif msg[0] == 'step':
                actions = msg[1]
                o, r, d, i = e.step(np.array(actions))
                o = floatify(o)  # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(), o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))
    return  # end process
print("Initializing new best") w_try = copy.deepcopy(w) best_reward = 0. runs = 500 unev_runs = 0 print("Baseline, run with w_best") observation = env.reset(difficulty=0) total_reward = 0.0 for i in range(500): i *= 0.01 if i > 2: i -= 2 observation, reward, done, info = env.step(input(w_best, i)) reward -= 1.38 T = 2 else: # make a step given by the controller and record the state and the reward observation, reward, done, info = env.step(input(w_first, i)) total_reward += reward if done: break best_reward = total_reward # Your reward is print("Total reward %f" % total_reward) for run in range(runs):
import math


class WrapperEnv():
    def __init__(self, game='l2r', visualize=False, max_obstacles=10,
                 skip_count=1):
        self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
        self.step_count = 0
        self.old_observation = None
        self.skip_count = 1  # skip_count  # 4
        self.last_x = 0
        self.current_x = 0
        self.observation_space_shape = (76,)
        self.action_space = self.env.action_space
        self.difficulty = 2

    def obg(self, plain_obs):
        # observation generator: derivatives of observations extracted here.
        processed_observation, self.old_observation = go(
            plain_obs, self.old_observation, step=self.step_count)
        return np.array(processed_observation)

    def process_action(self, action):
        processed_action = [(v + 1.0) / 2 for v in action]
        return processed_action

    def step(self, action):
        action = [float(action[i]) for i in range(len(action))]
        action = self.process_action(action)
        for num in action:
            if math.isnan(num):
                print('NaN met', action)
                raise RuntimeError('this is bullshit')
        sr = 0
        sp = 0
        o, oo = [], []
        d, i = 0, 0
        self.last_x = self.current_x
        for j in range(self.skip_count):
            self.step_count += 1
            oo, r, d, i = self.env.step(action)
            self.current_x = oo[1]
            headx = oo[22]
            px = oo[1]
            py = oo[2]
            kneer = oo[7]
            kneel = oo[10]
            lean = min(0.3, max(0, px - headx - 0.15)) * 0.05
            joint = sum([max(0, k - 0.1) for k in [kneer, kneel]]) * 0.03
            penalty = lean + joint
            o = self.obg(oo)
            sr += r
            sp += penalty
            if d is True:
                break
        res = [o, sr, d, sp]
        # res = [o, sr, d, i]
        return res

    def reset(self, difficulty=2):
        self.difficulty = difficulty
        self.step_count = 0
        self.old_observation = None
        oo = self.env.reset(difficulty=difficulty)
        self.last_x = oo[1]
        self.current_x = oo[1]
        o = self.obg(oo)
        return o

    def seed(self, s):
        self.env.seed(s)
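# Hypothetical usage sketch for the WrapperEnv above (the zero action is an
# assumption): actions are expected in [-1, 1] (e.g. tanh policy outputs) and
# mapped to [0, 1] internally, and step() returns the shaping penalty in place
# of the usual info dict.
env = WrapperEnv(visualize=False, max_obstacles=10)
observation = env.reset(difficulty=2)
observation, reward, done, penalty = env.step(np.zeros(18))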
        -0.12318915143209062, 0.8572259102524259, 0.8941775106918655,
        -0.01404221329096258, 0.2295314378679483, -0.021037075157206642,
        -0.681491323768328, 0.31352610722416563, -0.7920196539712908,
        -0.1582820172462255, 0.311412855895345, -0.10984746585998507,
        -0.02296411197489962, -0.00802550380398804, -0.0017461366413788204,
        -0.3041740263231416, 0.016811307539095512, 0.1819317051058162,
        0.7530491584560023, 0.976491955750641, 0.21107478867080567,
        -0.014844146458944022, 0.898891834890124, 1.5194984121043213,
        0.8572259102524259, 0.8941775106918655, 0.7562374616756263,
        0.9883207301533766, 0.8043896105729741, -0.02441792460823137,
        0.2730685561935682, 0.051075198992798276, 0.6779778525960489,
        0.029056095674362847, 0.2780392718810951, 0.18824186908999707,
        1, 1, 100, 0, 0
    ]
    if visualize:
        manager = opensim.Manager(self.osim_model.model)
        manager.setInitialTime(-0.00001)
        manager.setFinalTime(0.0)
        manager.integrate(state)


env = RunEnv(visualize=True)
env.__init__ = types.MethodType(modified_init, env)
observation = env.reset(difficulty=0)
for i in range(200):
    observation, reward, done, info = env.step([1.0] * 18)
    print(reward)
print("Initializing new best") w_try=copy.deepcopy(w) best_reward=0. runs=500 unev_runs=0 print("Baseline, run with w_best") observation = env.reset(difficulty = 0) total_reward = 0.0 for i in range(500): i*=0.01 if i>2: i-=2 observation, reward, done, info = env.step(input(w_best,i)) T=2 else: # make a step given by the controller and record the state and the reward observation, reward, done, info = env.step(input(w_first,i)) total_reward += reward if done: break best_reward=total_reward # Your reward is print("Total reward %f" % total_reward) for run in range(runs):
# 1. find the corresponding "env.step" calls
# 2. find & build my controller.
# 3. Does the original model count body impairment into the reward? (check github)
# 4. range & iteration. How do we judge failure?
'''
If the reward is the distance, does every step have a reward?
Can we know each part of the reward?
'''
from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
for j in range(3):
    print("-------")
    for i in range(2):
        eas = env.action_space.sample()
        o, r, d, i = env.step(eas)  # note: this rebinds the loop variable i
        print("eas")
        print(eas)
        print("o")
        print(o)
        print('r')
        print(r)
        print('d')
        print(d)
        print('i')
        print(i)
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48
    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()
    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)
            state = numpy.array(state)
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            reward_sum = 0
            for t in range(10000):  # don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                BB = numpy.append(action, action)
                reward = 0
                if args.skip:
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A
                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)
                next_state = numpy.array(next_state)
                reward_sum += reward
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': ac_net.state_dict(),
                    'optimizer': opt_ac,
                    'obs': shared_obs_stats,
                }, PATH_TO_MODEL, 'best')
        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': ac_net.state_dict(),
                    'optimizer': opt_ac,
                    'obs': shared_obs_stats,
                }, PATH_TO_MODEL, epoch)
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)
    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)
    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()
    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                if args.skip:
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        # update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        time.sleep(60)
        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, 'best')
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, epoch)
def test_first_obs(self):
    env = RunEnv(visualize=False)
    observation_start = env.reset()
    observation, reward, done, info = env.step(env.action_space.sample())
    self.assertAlmostEqual(observation_start[-1], observation[-1])
    self.assertAlmostEqual(observation_start[-2], observation[-2])