def test1(self):
    env = RunEnv(visualize=False)
    observation = env.reset()
    action = env.action_space.sample()
    action[5] = np.NaN
    self.assertRaises(ValueError, env.step, action)

class OsimEnv(Env):
    def __init__(self, visualize=True, test=False, step_size=0.01,
                 processor=None, timestep_limit=1000):
        self.visualize = visualize
        self._osim_env = RunEnv(visualize=visualize)
        self._osim_env.stepsize = step_size
        self._osim_env.spec.timestep_limit = timestep_limit
        self._osim_env.horizon = timestep_limit
        # self._osim_env.integration_accuracy = 1e-1
        if test:
            self._osim_env.timestep_limit = 1000
        self.processor = processor
        print "stepsize: " + str(self._osim_env.stepsize)

    def reset(self, seed=None, difficulty=2):
        observation = self._osim_env.reset(seed=seed, difficulty=difficulty)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, 0.0, False, dict())
        return observation

    def step(self, action):
        if self.processor:
            action = self.processor.process_action(action)
        observation, reward, done, info = self._osim_env.step(action)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, reward, done, info)
        return observation, reward, done, info

    def get_observation_dim(self):
        return len(self.reset())

    def get_action_dim(self):
        nb_actions = self._osim_env.action_space.shape[0]
        return nb_actions

    # FOR PICKLING
    def __setstate__(self, state):
        self.__init__(visualize=state['visualize'])

    def __getstate__(self):
        state = {'visualize': self.visualize}
        return state

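# --- Illustrative driver for the OsimEnv wrapper above (not part of the original
# snippet): a minimal random-action episode, assuming no processor is attached.
# The episode length and random policy are assumptions for demonstration only;
# Python 2 print is used to match the wrapper's own print statement.
import numpy as np

env = OsimEnv(visualize=False, step_size=0.01, timestep_limit=1000)
obs = env.reset(seed=0, difficulty=0)
total_reward = 0.0
for _ in range(1000):
    # random muscle activations in [0, 1], one per actuated muscle
    action = np.random.uniform(0.0, 1.0, env.get_action_dim())
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
print "total reward: " + str(total_reward)
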
def main():
    env = RunEnv(visualize=True)
    env.close()
    with open('save.p', 'r') as f:
        population = pickle.load(f)
    nn = population[0][0]
    total_reward = 0
    observation = env.reset()
    for i in range(200):
        step = nn.compute(i)
        observation, reward, done, info = env.step(step)
        total_reward += reward
        if done:
            break
    print total_reward

class LTR(BasicTask):
    name = 'LearningToRun'
    success_threshold = 2000

    def __init__(self):
        BasicTask.__init__(self)
        self.env = RunEnv(visualize=False)

    def step(self, action):
        action = np.clip(action, 0, 1)
        next_state, reward, done, info = self.env.step(action)
        return np.asarray(next_state) / math.pi, reward, done, info

    def reset(self):
        state = self.env.reset(difficulty=0,
                               seed=np.random.randint(0, 10000000))
        return np.asarray(state) / math.pi

class OsimAdapter:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        self.reset()

    def reset(self, difficulty=2):
        self.reward = 0
        self.total_reward = 0
        self.timestamp = 0.
        self.features = np.array(
            (self.env.reset(difficulty=difficulty))).reshape((1, -1))
        self.last_obs = np.zeros(shape=(1, 41))
        self.features = np.concatenate([self.features, self.last_obs], axis=1)
        self.done = False
        return self.features

    def get_action_space(self):
        space = [1] * 18
        return space

    def get_observation_space(self):
        return 41 * 2

    def step(self, actions):
        mean_possible = (np.array(self.env.action_space.low) +
                         np.array(self.env.action_space.high)) / 2.
        actions = np.array(actions) + mean_possible
        actions *= (np.array(self.env.action_space.high) -
                    np.array(self.env.action_space.low))
        actions = np.clip(actions, self.env.action_space.low,
                          self.env.action_space.high)
        obs, reward1, done, _ = self.env.step(actions)
        reward2 = 0
        if not done:
            obs, reward2, done, _ = self.env.step(actions)
        self.features = np.array(obs).reshape((1, -1))
        self.features = np.concatenate(
            [self.features, self.features - self.last_obs], axis=1)
        self.last_obs = np.array(obs).reshape((1, -1))
        self.reward = reward1 + reward2
        self.total_reward += self.reward
        self.done = done
        self.timestamp += 1

    def get_total_reward(self):
        return self.total_reward

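# --- Hypothetical usage sketch for the OsimAdapter above (not from the original
# code): it assumes the caller supplies zero-centred actions, which the adapter
# shifts to mid-range activations (0.5) before clipping. The constant policy and
# step count are illustrative only.
import numpy as np

adapter = OsimAdapter()
features = adapter.reset(difficulty=0)
for _ in range(100):
    actions = np.zeros(len(adapter.get_action_space()))  # 18 zero-centred actions
    adapter.step(actions)  # internally advances the simulator two steps
    if adapter.done:
        break
print(adapter.get_total_reward())
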
def test(args):
    print('start testing')
    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)
    np.random.seed(args.seed)

    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()
        state = fg.gen(state)
        #obs = fg.traj[0]
        #print(obs.left_knee_r, obs.right_knee_r)

        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)
            #obs = fg.traj[0]
            #print(obs.left_knee_r, obs.right_knee_r)
            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()
            state = next_state
            ep_reward += reward
            step += 1
            print('reward:', ep_reward)
            if done:
                break
        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            i, ep_reward, step))

def standalone_headless_isolated(conn, visualize, n_obstacles, run_logs_dir,
                                 additional_info, higher_pelvis=0.65):
    try:
        e = RunEnv(visualize=visualize, max_obstacles=n_obstacles)
        if higher_pelvis != 0.65:
            bind_alternative_pelvis_judgement(e, higher_pelvis)
        e = MyRunEnvLogger(e, log_dir=run_logs_dir,
                           additional_info=additional_info)
        while True:
            msg = conn.recv()
            # messages should be tuples,
            # msg[0] should be string
            if msg[0] == 'reset':
                o = e.reset(difficulty=msg[1], seed=msg[2])
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            elif msg[0] == 'close':
                e.close()
                conn.send(None)

                import psutil
                current_process = psutil.Process()
                children = current_process.children(recursive=True)
                for child in children:
                    child.terminate()
                return
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        conn.send(e)

class GameManager:
    def __init__(self, game_name, display):
        self.game_name = game_name
        self.display = display
        # self.env = gym.make(game_name)
        self.env = RunEnv(self.display)
        self.reset()

    def reset(self):
        observation = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        observation, reward, done, info = self.env.step(action)
        return observation, reward, done, info

    def _update_display(self):
        # if self.display:
        #     self.env.render()
        return

def test(actor, critic, args, act_update_fn):
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)

    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)

    all_episode_metrics = []
    for episode in range(args.num_episodes):
        episode_metrics = {
            "reward": 0.0,
            "step": 0,
        }

        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)
        observation = env.reset(difficulty=2, seed=SEEDS[episode % len(SEEDS)])
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)

        done = False
        while not done:
            print(episode_metrics["reward"])
            action = act_fn(observation)
            observation, reward, done, _ = env.step(action_handler(action))
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1
            if done:
                break
            observation = observation_handler(observation, action)

        all_episode_metrics.append(episode_metrics)

    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())

class keras_model(object):
    def __init__(self, shared_object):
        log_info('setting keras_model main parameters')
        self.shared_object = shared_object
        self.model_class = shared_object.get('model_class', None)
        self.name = shared_object.get('model_name', None)
        self.network_name = shared_object.get('network', None)
        self.train_bool = shared_object.get('train', True)
        self.test_bool = shared_object.get('test', True)
        self.load_bool = not self.train_bool
        self.submit_bool = shared_object.get('submit', False)
        self.tokenId = shared_object.get('submit_token', None)
        self.visualize = shared_object.get('visualize', False)
        self.save_bool = shared_object.get('save', True)
        self.save_path = shared_object.get(
            'save_path',
            os.path.join('model_weights', self.model_class, self.name,
                         self.network_name + '.h5f'))
        self.save_folder = os.path.dirname(self.save_path)
        self.env = shared_object.get('env', None)
        if self.env is None:
            self.env = RunEnv(self.visualize)
            self.shared_object['env'] = self.env
            shared_object['env'] = self.env
        self.env.reset()
        self.nb_actions = self.env.action_space.shape[0]
        self.metrics = shared_object.get('metrics', ['mae'])
        self.optimizer_name = shared_object.get('optimizer', 'Adam')
        self.optimizer_params = shared_object.get('optimizer_params', None)

        log_info("setting keras_model's training parameters")
        self.train_parameters = {}
        self.train_parameters['nb_steps'] = shared_object.get('nb_steps', 0)
        self.train_parameters['action_repetition'] = shared_object.get(
            'action_repetition', 0)
        self.train_parameters['callback_names'] = shared_object.get(
            'callback_names', None)
        self.train_parameters['callbacks'] = load_callbacks(
            self.train_parameters['callback_names'])
        self.train_parameters['verbose'] = shared_object.get('verbose', 0)
        self.train_parameters['nb_max_start_steps'] = shared_object.get(
            'nb_max_start_steps', 0)
        self.train_parameters['start_step_policy'] = shared_object.get(
            'start_step_policy', None)
        self.train_parameters['log_interval'] = shared_object.get(
            'log_interval', 1)
        self.train_parameters['nb_max_episode_steps'] = shared_object.get(
            'nb_max_episode_steps', self.env.timestep_limit)

        log_info("setting keras_model's testing parameters")
        self.test_parameters = {}
        self.test_parameters['nb_episodes'] = shared_object.get(
            'test_nb_episodes', 1)
        self.test_parameters['nb_max_episode_steps'] = shared_object.get(
            'test_nb_max_episode_steps', 1)

        log_info('loading networks : {}'.format(self.network_name))
        self.load_networks()
        log_info('loading networks done')
        log_info('building optimizer : {}'.format(self.optimizer_name))
        self.build_optimizer()
        log_info('optimizer built successfully')
        log_info('building the agent')
        self.build_agent()
        log_info('agent successfully built')

    def train(self):
        """
        # Arguments
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the
                same action without observing the environment again. Setting
                this to a value > 1 can be useful if a single action only has
                a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or
                `rl.callbacks.Callback` instances): List of callbacks to apply
                during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging
                (compare `log_interval`), 2 for episode logging.
            visualize (boolean): If `True`, the environment is visualized
                during training. However, this is likely going to slow down
                training significantly and is thus intended to be a debugging
                instrument.
            nb_max_start_steps (integer): Number of maximum steps that the
                agent performs at the beginning of each episode using
                `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly
                from [0, max_start_steps] at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy to
                follow if `nb_max_start_steps` > 0. If set to `None`, a random
                action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that
                are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that
                the agent performs before automatically resetting the
                environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a
                terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire
            training process.
        """
        callback_history = self.agent.fit(
            self.env,
            nb_steps=self.train_parameters['nb_steps'],
            action_repetition=self.train_parameters['action_repetition'],
            callbacks=self.train_parameters['callbacks'],
            verbose=self.train_parameters['verbose'],
            visualize=self.visualize,
            nb_max_start_steps=self.train_parameters['nb_max_start_steps'],
            start_step_policy=self.train_parameters['start_step_policy'],
            log_interval=self.train_parameters['log_interval'],
            nb_max_episode_steps=self.train_parameters['nb_max_episode_steps'])
        if self.save_bool:
            if not os.path.exists(self.save_folder):
                os.makedirs(self.save_folder)
            self.agent.save_weights(self.save_path, overwrite=True)
        return callback_history

    def load(self):
        log_info('loading model : {}'.format(self.name))
        self.agent.load_weights(self.save_path)

    def test(self):
        self.agent.test(
            self.env,
            nb_episodes=self.test_parameters['nb_episodes'],
            visualize=self.visualize,
            nb_max_episode_steps=self.test_parameters['nb_max_episode_steps'])

    def submit(self):
        remote_base = 'http://grader.crowdai.org:1729'
        env = RunEnv(visualize=self.visualize)
        client = Client(remote_base)

        # Create environment
        observation = client.env_create(self.tokenId)

        # Run a single step
        #
        # The grader runs 3 simulations of at most 1000 steps each. We stop
        # after the last one.
        while True:
            [observation, reward, done, info] = client.env_step(
                self.agent.forward(observation))
            if done:
                observation = client.env_reset()
                if not observation:
                    break
        client.submit()

    def build_optimizer(self):
        log_info('loading optimizer class : {}'.format(self.optimizer_name))
        optimizer_class = import_class('keras.optimizers.{}'.format(
            self.optimizer_name))
        self.optimizer = optimizer_class(**self.optimizer_params)

    def run(self):
        if self.train_bool:
            log_info('starting to train the model...')
            self.train()
        if self.load_bool:
            log_info('starting to load the model...')
            self.load()
        if self.test_bool:
            log_info('starting testing the model ...')
            self.test()
        if self.submit_bool:
            log_info('starting to submit the model')
            self.submit()

    def load_networks(self):
        network_class = import_class('models.nn.{}.{}'.format(
            self.model_class, self.network_name))
        self.networks = network_class(self.shared_object)

    def build_agent(self):
        raise NotImplementedError

    # (fragment: trailing arguments of a training call, followed by the test branch)
    callback=on_iteration_start,
    verbose=args.verbose,
)
env.close()

if MPI.COMM_WORLD.Get_rank() == 0:
    plot_history(history)
    save_model()
    if args.repeat:
        cmd = 'python run_osim.py --repeat --train --model %s --steps %s --size %s' % (
            args.model, args.steps, args.size)
        subprocess.call(cmd.split(' '))

if args.test:
    observation = env.reset()
    observation = preprocess(observation, step=1, verbose=args.verbose)
    pi = policy_fn('pi', env.observation_space, env.action_space)
    if not load_model():
        exit(0)
    done = False
    total = 0
    steps = 0
    while not done:
        action = pi.act(True, observation)[0]
        observation, reward, done, info = env.step(action)
        if args.visualize:
            vis.pointCameraAt(opensim.Vec3(observation[1], 0, 0),
                              opensim.Vec3(0, 1, 0))
        observation = preprocess(observation, step=steps + 2,
                                 verbose=args.verbose)

'''
Script to print observation values of the osim Running Environment
'''
import numpy as np
from envs.diffEnv import diffEnv
from osim.env import RunEnv

env = RunEnv(max_obstacles=10)
obs = env.reset(difficulty=2, seed=47)


def print_obs(obs):
    #iterate = range(len(obs))
    iterate = [29, 31, 33, 35]
    for i in iterate:
        print(str(i) + ": " + str(obs[i]))


try:
    print_obs(obs)
    for t in range(200):
        action = np.zeros(18)
        obs, _, done, _ = env.step(action)
        print("")
        print_obs(obs)
        if done:
            break
    while True:
        pass
except KeyboardInterrupt:
    pass

def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            #global last_state
            #last_state = state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                #timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                #print('NN take:')
                #print(time.time()-timer)
                #print(action)
                #print("------------------------")
                #timer = time.time()
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env take:')
                #print(time.time()-timer)
                #timer = time.time()

                #last_state ,next_state = update_observation(last_state,next_state)
                next_state = running_state(next_state)
                #print(next_state[41:82])
                mask = 1
                if done:
                    mask = 0
                #print('update take:')
                #print(time.time()-timer)
                #timer = time.time()
                memory.push(state, np.array([action]), mask, next_state,
                            reward)
                #print('memory take:')
                #print(time.time()-timer)
                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum

        #print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()
        #update_params_actor_critic(batch,args,shared_model,ac_net,opt_ac)
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            #print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #    i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))

            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                    }, PATH_TO_MODEL, epoch)

from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
for i in range(200):
    observation, reward, done, info = env.step(env.action_space.sample())
    print(reward)
    if done:
        break

def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=True, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')
            if msg[0] == 'reset':  # or (previous_o==None and msg[0]=='step'):
                o = e.reset(difficulty=0)
                o = floatify(o)
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)
            elif msg[0] == 'step':
                actions = msg[1]
                o, r, d, i = e.step(np.array(actions))
                o = floatify(o)  # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(), o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process

def test_first_obs(self):
    env = RunEnv(visualize=False)
    observation_start = env.reset()
    observation, reward, done, info = env.step(env.action_space.sample())
    self.assertAlmostEqual(observation_start[-1], observation[-1])
    self.assertAlmostEqual(observation_start[-2], observation[-2])

class Actor(multiprocessing.Process):
    def __init__(self, args, task_q, result_q, actor_id, monitor):
        multiprocessing.Process.__init__(self)
        self.task_q = task_q
        self.result_q = result_q
        self.args = args
        self.monitor = False

    def act(self, obs):
        obs = np.expand_dims(obs, 0)
        action_dist_mu, action_dist_logstd = self.session.run(
            [self.action_dist_mu, self.action_dist_logstd],
            feed_dict={self.obs: obs})
        # samples the gaussian distribution
        act = action_dist_mu + np.exp(action_dist_logstd) * \
            np.random.randn(*action_dist_logstd.shape)
        return act.ravel(), action_dist_mu, action_dist_logstd

    def run(self):
        self.env = RunEnv(visualize=False)
        self.env.reset(difficulty=2, seed=int(time.time()))
        if self.monitor:
            self.env.monitor.start('monitor/', force=True)

        # tensorflow variables (same as in model.py)
        self.observation_size = 55 + 7
        self.action_size = np.prod(self.env.action_space.shape)
        self.hidden_size = 128
        weight_init = tf.random_uniform_initializer(-0.05, 0.05)
        bias_init = tf.constant_initializer(0)

        # tensorflow model of the policy
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.debug = tf.constant([2, 2])

        with tf.variable_scope("policy-a"):
            h1 = fully_connected(self.obs, self.observation_size,
                                 self.hidden_size, weight_init, bias_init,
                                 "policy_h1")
            h1 = tf.nn.relu(h1)
            h2 = fully_connected(h1, self.hidden_size, self.hidden_size,
                                 weight_init, bias_init, "policy_h2")
            h2 = tf.nn.relu(h2)
            h3 = fully_connected(h2, self.hidden_size, self.action_size,
                                 weight_init, bias_init, "policy_h3_1")
            h3 = tf.nn.tanh(h3, name="policy_h3")
            action_dist_logstd_param = tf.Variable(
                (.01 * np.random.randn(1, self.action_size)).astype(np.float32),
                name="policy_logstd")
            self.action_dist_mu = h3
            self.action_dist_logstd = tf.tile(
                action_dist_logstd_param,
                tf.stack((tf.shape(self.action_dist_mu)[0], 1)))

        config = tf.ConfigProto(device_count={'CPU': 0})
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())
        var_list = tf.trainable_variables()

        self.set_policy = SetPolicyWeights(self.session, var_list)

        while True:
            # get a task, or wait until it gets one
            next_task = self.task_q.get(block=True)
            if next_task == 1:
                # the task is an actor request to collect experience
                path = self.rollout()
                self.task_q.task_done()
                self.result_q.put(path)
            elif next_task == 2:
                print "kill message"
                if self.monitor:
                    self.env.monitor.close()
                self.task_q.task_done()
                break
            else:
                # the task is to set parameters of the actor policy
                self.set_policy(next_task)
                # super hacky method to make sure when we fill the queue with
                # set parameter tasks, an actor doesn't finish updating before
                # the other actors can accept their own tasks.
                time.sleep(0.1)
                self.task_q.task_done()
        return

    def rollout(self):
        obs, actions, rewards, action_dists_mu, action_dists_logstd = \
            [], [], [], [], []
        self.old_observation = None
        plain_obs = self.env.reset(difficulty=2, seed=int(time.time()))
        processed_observation, self.old_observation = go(
            plain_obs, self.old_observation, step=1)
        ob = filter(processed_observation)
        for i in xrange(self.args.max_pathlength - 1):
            obs.append(ob)
            action, action_dist_mu, action_dist_logstd = self.act(ob)
            action = np.clip(action, a_max=1.0, a_min=0.0)
            actions.append(action)
            action_dists_mu.append(action_dist_mu)
            action_dists_logstd.append(action_dist_logstd)
            res = self.env.step(action)
            processed_observation, self.old_observation = go(
                res[0], self.old_observation, step=1)
            ob = filter(processed_observation)
            rewards.append(res[1])
            if res[2] or i == self.args.max_pathlength - 2:
                path = {
                    "obs": np.concatenate(np.expand_dims(obs, 0)),
                    "action_dists_mu": np.concatenate(action_dists_mu),
                    "action_dists_logstd": np.concatenate(action_dists_logstd),
                    "rewards": np.array(rewards),
                    "actions": np.array(actions)
                }
                return path

running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

episode_lengths = []
last_state = 41 * [0]

for i_episode in count(1):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        #state = env.reset()
        #print(num_steps)
        state = env.reset(difficulty=0)
        last_state, state = process_observation(last_state, state)
        #print(len(state))
        state = numpy.array(state)
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            #print(t)
            if args.use_sep_pol_val:
                action = select_action(state)
            else:
                action = select_action_actor_critic(state)
            #print(action)

def train(rank, params, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, test_n):
    torch.manual_seed(params.seed)
    #env = gym.make(params.env_name)
    env = RunEnv(visualize=False)
    #num_inputs = env.observation_space.shape[0]
    #num_outputs = env.action_space.shape[0]
    num_inputs = params.num_inputs
    num_outputs = params.num_outputs
    model = Model(num_inputs, num_outputs)

    last_state = []

    memory = ReplayMemory(params.exploration_size)

    #state = env.reset()
    state = env.reset(difficulty=0)
    last_state, state = process_observation(last_state, state)
    state = numpy.array(state)

    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())

        w = -1
        av_reward = 0
        nb_runs = 0
        reward_0 = 0
        t = -1
        while w < params.exploration_size:
            t += 1
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []
            av_reward = 0
            cum_reward = 0
            cum_done = 0

            # Perform K steps
            for step in range(params.num_steps):
                w += 1
                shared_obs_stats.observes(state)
                state = shared_obs_stats.normalize(state)
                states.append(state)

                mu, sigma_sq, v = model(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt() * Variable(eps))
                actions.append(action)
                values.append(v)

                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = env.step(env_action)
                last_state, state = process_observation(last_state, state)
                state = numpy.array(state)

                done = (done or episode_length >= params.max_episode_length)
                cum_reward += reward
                reward = max(min(reward, 1), -1)
                rewards.append(reward)
                if done:
                    cum_done += 1
                    av_reward += cum_reward
                    cum_reward = 0
                    episode_length = 0
                    state = env.reset(difficulty=0)
                    last_state = []
                    last_state, state = process_observation(last_state, state)
                    state = numpy.array(state)
                state = Variable(torch.Tensor(state).unsqueeze(0))
                if done:
                    break

            # one last step
            R = torch.zeros(1, 1)
            if not done:
                _, _, v = model(state)
                R = v.data
            # compute returns and GAE(lambda) advantages:
            values.append(Variable(R))
            R = Variable(R)
            A = Variable(torch.zeros(1, 1))
            for i in reversed(range(len(rewards))):
                td = rewards[i] + params.gamma * values[i + 1].data[0, 0] - \
                    values[i].data[0, 0]
                A = float(td) + params.gamma * params.gae_param * A
                advantages.insert(0, A)
                R = A + values[i]
                returns.insert(0, R)
            # store useful info:
            memory.push([states, actions, returns, advantages])

        # policy grad updates:
        av_reward /= float(cum_done + 1)
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())
        if t == 0:
            reward_0 = av_reward - (1e-2)
        #batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
        for k in range(params.num_epoch):
            # load new model
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()
            # new mini_batch
            batch_states, batch_actions, batch_returns, batch_advantages = \
                memory.sample(params.batch_size)
            # old probas
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states.detach())
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            # new probas
            mu, sigma_sq, v_pred = model(batch_states)
            probs = normal(batch_actions, mu, sigma_sq)
            # ratio
            ratio = probs / (1e-10 + probs_old)
            # clip loss
            surr1 = ratio * torch.cat(
                [batch_advantages] * num_outputs,
                1)  # surrogate from conservative policy iteration
            surr2 = ratio.clamp(1 - params.clip, 1 + params.clip) * \
                torch.cat([batch_advantages] * num_outputs, 1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(
                -params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5 * torch.mean(torch.max(vfloss1, vfloss2))
            # entropy
            loss_ent = -params.ent_coeff * torch.mean(
                probs * torch.log(probs + 1e-5))
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
            #print(total_loss.data[0])
            # before step, update old_model:
            model_old.load_state_dict(model.state_dict())
            # prepare for step
            total_loss.backward(retain_variables=True)
            #ensure_shared_grads(model, shared_model)
            #shared_model.cum_grads()
            shared_grad_buffers.add_gradient(model)

            counter.increment()

            # wait for a new signal to continue
            while traffic_light.get() == signal_init:
                pass

        test_n += 1
        memory.clear()

class WrapperEnv():
    def __init__(self, game='l2r', visualize=False, max_obstacles=10,
                 skip_count=1):
        self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
        self.step_count = 0
        self.old_observation = None
        self.skip_count = 1  # skip_count # 4
        self.last_x = 0
        self.current_x = 0
        self.observation_space_shape = (76, )
        self.action_space = self.env.action_space
        self.difficulty = 2

    def obg(self, plain_obs):
        # observation generator
        # derivatives of observations extracted here.
        processed_observation, self.old_observation = go(
            plain_obs, self.old_observation, step=self.step_count)
        return np.array(processed_observation)

    def process_action(self, action):
        processed_action = [(v + 1.0) / 2 for v in action]
        return processed_action

    def step(self, action):
        action = [float(action[i]) for i in range(len(action))]
        action = self.process_action(action)
        import math
        for num in action:
            if math.isnan(num):
                print('NaN met', action)
                raise RuntimeError('this is bullshit')
        sr = 0
        sp = 0
        o, oo = [], []
        d, i = 0, 0
        self.last_x = self.current_x
        for j in range(self.skip_count):
            self.step_count += 1
            oo, r, d, i = self.env.step(action)
            self.current_x = oo[1]
            headx = oo[22]
            px = oo[1]
            py = oo[2]
            kneer = oo[7]
            kneel = oo[10]
            lean = min(0.3, max(0, px - headx - 0.15)) * 0.05
            joint = sum([max(0, k - 0.1) for k in [kneer, kneel]]) * 0.03
            penalty = lean + joint
            o = self.obg(oo)
            sr += r
            sp += penalty
            if d is True:
                break
        res = [o, sr, d, sp]
        # res = [o, sr, d, i]
        return res

    def reset(self, difficulty=2):
        self.difficulty = difficulty
        self.step_count = 0
        self.old_observation = None
        oo = self.env.reset(difficulty=difficulty)
        self.last_x = oo[1]
        self.current_x = oo[1]
        o = self.obg(oo)
        return o

    def seed(self, s):
        self.env.seed(s)

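# --- Hypothetical driver for WrapperEnv above (not part of the original code):
# it assumes the agent emits actions in [-1, 1], which process_action rescales
# to [0, 1]; the constant action, obstacle count, and step limit are
# illustrative assumptions only. Note that step() returns the lean/knee penalty
# in place of the usual info dict.
import numpy as np

env = WrapperEnv(visualize=False, max_obstacles=3)
obs = env.reset(difficulty=0)
episode_reward, episode_penalty = 0.0, 0.0
for _ in range(300):
    action = np.zeros(18) - 1.0  # maps to zero muscle activation after rescaling
    obs, reward, done, penalty = env.step(action)
    episode_reward += reward
    episode_penalty += penalty
    if done:
        break
print(episode_reward, episode_penalty)
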
from osim.env import RunEnv
import opensim

env = RunEnv(visualize=True)
observation = env.reset(seed=0)

s = 0
for s in range(50000):
    d = False
    if s == 30:
        state_old = opensim.State(env.osim_model.state)
        print("State stored")
        print(state_old)
    if s % 50 == 49:
        env.osim_model.revert(state_old)
        state_old = opensim.State(state_old)
        print("Rollback")
        print(state_old)
    o, r, d, i = env.step(env.action_space.sample())

def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)

    env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)
    #running_state = ZFilter((num_inputs,), clip=5)

    start_time = time.time()

    for i_episode in range(args.start_epoch + 1, 999999):
        #print(shared_obs_stats.n[0])
        #if rank == 0:
        #    print(running_state.rs._n)
        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            #state = numpy.array(state)
            last_state, last_v, state = process_observation(
                last_state, last_v, state)
            state = numpy.array(state)
            #state = running_state(state)
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #action = select_action_actor_critic(state,ac_net)
                    #action = action.data[0].numpy()
                    #state = state + numpy.random.rand(args.feature)*0.001
                    raise RuntimeError('action NaN problem')
                #print("------------------------")
                #timer = time.time()

                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A

                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                #print(next_state)

                #last_state = process_observation(state)
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)
                next_state = numpy.array(next_state)

                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)
                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)

        counter.increment()

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode,
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))

            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)

        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass

from osim.env import RunEnv
import numpy as np
import time
from Preprocessing import Preprocessing
import sys
#sys.path.insert(0, sys.path[0] + '/DDPG/DDPG.py')
from DDPG.DDPG import DDPG

#env = RunEnv(visualize=False)
env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
episodes = 100000
agent = DDPG(.9, 2000, 54, 18, .0001, criticpath='critic', actorpath='actor')
for episode in range(0, episodes):
    #env.step(action)
    # action is a list of length 18, values between [0, 1]
    ## specifics: 9 muscles per leg, 2 legs = 18.
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    observation = np.array(observation)
    Preprocess = Preprocessing(observation, delta=0.01)
    prevState = Preprocess.GetState(observation)
    agent.step = 0
    agent.OUprocess(.312, 0.15, 0.0)
    for i in range(1, 1000):
        if i > 1:

from osim.env import RunEnv
import argparse
import numpy as np

parser = argparse.ArgumentParser(
    description='Train or test neural net motor controller')
parser.add_argument('--seed', type=int, default=None)
args = parser.parse_args()

env = RunEnv(visualize=True)

if args.seed is None:
    seed = np.random.randint(2**32 - 1)
else:
    seed = args.seed
print("Seed = %d" % seed)

observation = env.reset(difficulty=2, seed=seed)
observation, reward, done, info = env.step(env.action_space.sample())
raw_input()

from osim.env import RunEnv
import numpy as np
import copy
import pickle

env = RunEnv(visualize=False)
observation = env.reset(difficulty=0)

sin = np.sin
file_Name = "w_best"
array = np.array

T = 4
alpha = 0.01
alpha_0 = 0.01

#TODO: we should exploit the Fourier property for which higher-harmonic weights
#tend to decay as 1/x^n for smooth and continuous functions.

#I initialize to 0 the weights list, with one weight array per muscle (I compose
#the periodic function from elements of a Fourier Series).
#I define weights only for 9 periodic functions, as I assume that the legs
#move symmetrically in time.
w = []
for i in range(9):
    w.append(np.array([0., 0., 0., 0., 0., 0., 0., 0.]))


def output(a, T, t):
    # Output of a 4th degree Fourier Series of sin.

        return rvel

    rvel = relative_vel(vel)
    left_rvel = relative_vel(left_vel)
    right_rvel = relative_vel(right_vel)

    central += [v * 10 for v in rvel]
    left += [v * 10 for v in left_rvel]
    right += [v * 10 for v in right_rvel]

    left += [np.clip(0.0 - obs.left_toe_y, 0.0, 0.05) * 20]
    left += [np.clip(0.05 - obs.left_talus_y, 0.0, 0.05) * 20]
    right += [np.clip(0.0 - obs.right_toe_y, 0.0, 0.05) * 20]
    right += [np.clip(0.05 - obs.right_talus_y, 0.0, 0.05) * 20]

    extero = self.draw_balls(obs.pelvis_x)

    self.step += 1
    #print(len(central), len(left), len(right), len(extero))
    return central + left + right + extero


if __name__ == '__main__':
    from osim.env import RunEnv
    env = RunEnv(visualize=False)
    state = env.reset()
    fg = FeatureGenerator()
    fg.gen(state)

def test(rank, params, shared_model, shared_obs_stats, test_n):
    PATH_TO_MODEL = '../models/' + params.bh

    torch.manual_seed(params.seed + rank)
    best_result = -1000
    work_dir = mkdir('exp', 'ppo')
    monitor_dir = mkdir(work_dir, 'monitor')
    last_state = []
    #env = gym.make(params.env_name)
    if params.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)
    #env = wrappers.Monitor(env, monitor_dir, force=True)
    #num_inputs = env.observation_space.shape[0]
    #num_outputs = env.action_space.shape[0]
    num_inputs = params.num_inputs
    num_outputs = params.num_outputs
    model = Model(num_inputs, num_outputs)

    #state = env.reset()
    state = env.reset(difficulty=0)
    last_state, state = process_observation(last_state, state)
    state = numpy.array(state)
    state = Variable(torch.Tensor(state).unsqueeze(0))
    reward_sum = 0
    done = True

    start_time = time.time()

    episode_length = 0
    epoch = 0
    while True:
        #print(episode_length)
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())

        shared_obs_stats.observes(state)
        #print(shared_obs_stats.n[0])
        state = shared_obs_stats.normalize(state)
        mu, sigma_sq, _ = model(state)
        eps = torch.randn(mu.size())
        action = mu + sigma_sq.sqrt() * Variable(eps)
        env_action = action.data.squeeze().numpy()
        state, reward, done, _ = env.step(env_action)
        last_state, state = process_observation(last_state, state)
        state = numpy.array(state)
        reward_sum += reward

        if done:
            print("Time {}, epoch {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                epoch, reward_sum, episode_length))
            epoch = epoch + 1
            if reward_sum > best_result:
                best_result = reward_sum
                save_model(
                    {
                        'epoch': epoch,
                        'bh': params.bh,
                        'state_dict': model.state_dict(),
                        #'optimizer': shared_obs_stats.state_dict(),
                    }, PATH_TO_MODEL, 'best')

            if epoch % 100 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': params.bh,
                        'state_dict': model.state_dict(),
                        #'optimizer': shared_obs_stats.state_dict(),
                    }, PATH_TO_MODEL, epoch)

            reward_sum = 0
            episode_length = 0
            state = env.reset(difficulty=0)
            last_state = []
            last_state, state = process_observation(last_state, state)
            state = numpy.array(state)
            time.sleep(10)

        state = Variable(torch.Tensor(state).unsqueeze(0))

def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48

    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)

            state = numpy.array(state)
            #global last_state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #state = running_state(state)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                #print("------------------------")
                #timer = time.time()

                BB = numpy.append(action, action)
                #print(BB)
                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A

                next_state, A, done, _ = env.step(BB)
                reward += A

                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)

                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)
                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        #print('backpropagate:')
        #print(time.time()-timer)

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)

# Plots the action sample space to see the range
from osim.env import RunEnv
import matplotlib.pyplot as plt

env = RunEnv(visualize=False)
env.reset(difficulty=0)
samples = [env.action_space.sample() for i in range(300)]
y1 = [v[0] for v in samples]
plt.plot(samples)
plt.show()

def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False, max_obstacles=10)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')
            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                # conn.send(floatify(o))
                cq.put(floatify(o))
                # conn.put(floatify(o))
            elif msg[0] == 'step':
                o, r, d, i = e.step(msg[1])
                o = floatify(o)  # floatify the observation
                cq.put((o, r, d, i))
                # conn.put(ordi)
                # conn.send(ordi)
            else:
                # conn.close()
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process

class OpenSim(Env):  # low dimensional observations
    """Class to set up the OpenSim-RL environment
    (https://github.com/praveen-palanisamy/pytorch-rl.git)
    where the agent has to learn to run.
    Continuous (18 dim) action space."""

    def __init__(self, args, env_ind=0):
        super(OpenSim, self).__init__(args, env_ind)
        assert self.env_type == "opensim"
        try:
            from osim.env import RunEnv
        except ImportError as e:
            self.logger.warning("WARNING: opensim not found")

        self.env = RunEnv(visualize=True)
        #self.env.seed(self.seed)  # NOTE: so each env would be different

        # action space setup
        self.actions = range(self.action_dim)
        self.logger.warning("Action Space: %s", self.env.action_space)
        # state space setup
        self.logger.warning("State Space: %s", self.state_shape)

        # continuous space
        #if args.agent_type == "a3c":
        self.enable_continuous = True  #args.enable_continuous

    def _preprocessState(self, state):
        # NOTE: here no preprocessing is needed
        return state

    @property
    def action_dim(self):
        return self.env.action_space.shape[0]

    @property
    def state_shape(self):
        return self.env.observation_space.shape[0]

    def render(self):
        #if self.mode == 2:
        #    frame = self.env.render(mode='rgb_array')
        #    frame_name = self.img_dir + "frame_%04d.jpg" % self.frame_ind
        #    self.imsave(frame_name, frame)
        #    self.logger.warning("Saved Frame @ Step: " + str(self.frame_ind) +
        #                        " To: " + frame_name)
        #    self.frame_ind += 1
        #    return frame
        #else:
        #    return self.env.render()
        return

    def visual(self):
        pass

    def sample_random_action(self):
        return self.env.action_space.sample()

    def reset(self):
        self._reset_experience()
        self.exp_state1 = self.env.reset()
        return self._get_experience()

    def step(self, action):
        self.exp_action = action
        if self.enable_continuous:
            self.exp_state1, self.exp_reward, self.exp_terminal1, _ = \
                self.env.step(self.exp_action)
        return self._get_experience()