def submit(actor, critic, args, act_update_fn):
    """Run the trained actor against the remote grader and (optionally) submit.

    Loops over the grader-served episodes, collecting per-episode reward/step
    metrics, then prints a summary and asks for confirmation before submitting.
    """
    # Build the greedy action function from the training update factory.
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    client = Client(REMOTE_BASE)
    all_episode_metrics = []
    episode_metrics = {
        "reward": 0.0,
        "step": 0,
    }
    # Handlers are stateful, so they are rebuilt at every episode boundary.
    observation_handler = create_observation_handler(args)
    action_handler = create_action_handler(args)
    observation = client.env_create(args.token)
    # Seed the observation handler with a zero action for the first frame.
    action = np.zeros(ACTION_SHAPE, dtype=np.float32)
    observation = observation_handler(observation, action)
    submitted = False
    while not submitted:
        print(episode_metrics["reward"])
        action = act_fn(observation)
        observation, reward, done, _ = client.env_step(
            action_handler(action).tolist())
        episode_metrics["reward"] += reward
        episode_metrics["step"] += 1
        if done:
            all_episode_metrics.append(episode_metrics)
            episode_metrics = {
                "reward": 0.0,
                "step": 0,
            }
            observation_handler = create_observation_handler(args)
            action_handler = create_action_handler(args)
            # NOTE(review): the usual grader protocol starts subsequent
            # episodes with client.env_reset(), not env_create() — confirm
            # this client supports repeated env_create calls.
            observation = client.env_create(args.token)
            if not observation:
                # A falsy observation means no more episodes are served.
                submitted = True
                break
            action = np.zeros(ACTION_SHAPE, dtype=np.float32)
            observation = observation_handler(observation, action)
        else:
            observation = observation_handler(observation, action)
    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())
    if query_yes_no("Submit?"):
        client.submit()
class RemoteProstheticsEnv(gym.Env):
    """gym.Env adapter around the crowdAI grader HTTP client.

    Mirrors the local ProstheticsEnv interface while forwarding reset/step to
    the remote grader. A local OSmodel instance is kept only to track the
    simulation step counter.
    """

    def __init__(self, base, token, round):
        """base: grader URL; token: crowdAI token; round: 1 or 2 (sets time limit)."""
        self.base = base
        self.token = token
        self.client = None  # created lazily on first reset()
        # simulate local env
        self.osim_model = OSmodel()
        self.time_limit = 300 if round == 1 else 1000

    def reset(self, project=True):
        """Create the remote environment on first call; later calls are forbidden.

        Raises NotImplementedError if called again — the grader drives episode
        transitions itself via env_reset() inside step().
        """
        if self.client is None:  # fixed: identity comparison, not `== None`
            self.client = Client(self.base)
            obs = self.client.env_create(self.token, env_id='ProstheticsEnv')
            self.osim_model.istep = 0
            return obs
        else:
            # It is not allowed to call reset() twice in submitting.
            raise NotImplementedError

    def step(self, action, project=True):
        """Forward one action to the grader.

        When the grader reports an episode end, env_reset() is called here and
        `done` is only surfaced to the caller once the grader stops serving
        new episodes (env_reset() returns a falsy value).
        """
        self.osim_model.istep += 1
        [obs, reward, done, info] = self.client.env_step(action.tolist(), render=True)
        if done:
            self.osim_model.istep = 0
            obs = self.client.env_reset()
            if not obs:
                done = True  # no more episodes: the whole submission is over
            else:
                done = False  # a fresh episode started transparently
        return obs, reward, done, info
def submit(agent, logger, jump=False):
    """Run `agent` on the crowdAI grader and submit the result.

    agent  : trained agent exposing ob_processor, actor, act_low/act_high
    logger : logging.Logger-like object
    jump   : if True, duplicate the action vector (two-leg "jump" mode)
    """
    token = None  # deliberately unset: forces the user to paste their token
    assert token is not None, "You need to provide your token to submit()"
    # Settings
    remote_base = 'http://grader.crowdai.org:1729'
    client = Client(remote_base)
    # Create environment
    new_ob = client.env_create(token)
    agent.ob_processor.reset()
    zero_action = np.zeros(agent.env.action_space.shape).tolist()
    first_frame = True
    done = False
    # Run a single step
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    episode_count = 0
    episode_steps = 0
    episode_reward = 0
    all_rewards = []
    while True:
        # ignore first frame because it contains phantom obstacle
        if first_frame:
            new_ob, reward, done, info = client.env_step(zero_action, True)
            episode_reward += reward
            episode_steps += 1
            first_frame = False
            assert not done, "Episode finished in one step"
            continue
        new_ob = agent.ob_processor.process(new_ob)
        observation = np.reshape(new_ob, [1, -1])
        action, _ = agent.actor.predict(observation)
        action = np.clip(action, agent.act_low, agent.act_high)
        act_to_apply = action.squeeze()
        # FIX: was `if self.jump:` — this is a module-level function, so `self`
        # was an undefined name (NameError); use the `jump` parameter instead.
        if jump:
            act_to_apply = np.tile(act_to_apply, 2)
        [new_ob, reward, done, info] = client.env_step(act_to_apply.tolist(), True)
        episode_steps += 1
        episode_reward += reward
        logger.info("step={}, reward={}".format(episode_steps, reward))
        if done:
            episode_count += 1
            logger.info("Episode={}, steps={}, reward={}".format(
                episode_count, episode_steps, episode_reward))
            all_rewards.append(episode_reward)
            episode_steps = 0
            episode_reward = 0
            new_ob = client.env_reset()
            agent.ob_processor.reset()
            first_frame = True
            if not new_ob:
                # env_reset() returned falsy: last grader episode is done.
                break
    client.submit()
    logger.info("All rewards: {}".format(all_rewards))
def submit():
    """Load a pickled DDPG actor and submit it to the crowdAI grader."""
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "[YOUR_CROWD_AI_TOKEN_HERE]"
    client = Client(remote_base)
    task_fn = lambda: LTR()
    task = task_fn()
    state_dim = task.env.observation_space.shape[0]
    action_dim = task.env.action_space.shape[0]
    with open('data/ddpg-model-LearningToRun.bin', 'rb') as f:
        model = pickle.load(f)
    actor = DDPGActorNet(state_dim, action_dim)
    actor.load_state_dict(model)
    # Create environment
    state = client.env_create(crowdai_token)
    total_reward = 0.0
    while True:
        action = actor.predict(np.stack([state]), to_numpy=True).flatten()
        [state, reward, done, info] = client.env_step(action, True)
        total_reward += reward
        # FIX: was `print(observation)` — `observation` was undefined on the
        # first iterations (NameError); print the current state instead.
        print(state)
        if done:
            observation = client.env_reset()
            if not observation:
                break
            # FIX: the next episode's starting observation was previously
            # discarded, leaving `state` stuck at the terminal state.
            state = observation
    # FIX: was a Python-2 `print total_reward` statement in otherwise
    # Python-3-style code.
    print(total_reward)
    client.submit()
class Client_To_Env:
    def __init__(self, remote_base, crowdai_token):
        """
        Wrapper that reformats client environment to a local environment
        format, complete with observation_space, action_space, reset, step,
        submit, and time_limit.
        """
        self.client = Client(remote_base)
        self.crowdai_token = crowdai_token
        # Expose the raw client operations under local-env names.
        self.reset_ = self.client.env_reset
        self.step = self.client.env_step
        self.submit = self.client.submit
        self.time_limit = 300
        self.action_space = gym.spaces.Box(
            low=0, high=1, shape=(19, ), dtype=np.float32)
        self.first_reset = True

    def reset(self):
        # Only the very first reset may create the remote environment;
        # every later reset reuses it through env_reset().
        if not self.first_reset:
            return self.reset_()
        self.first_reset = False
        return self.client.env_create(self.crowdai_token, env_id='ProstheticsEnv')
def submit(pi):
    """Roll policy ``pi`` through the grader-served episodes, then submit."""
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "0dd7c22f5eb61cb4453b5a5b8e510656"
    client = Client(remote_base)
    observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")
    frame = 0
    score = 0
    while True:
        chosen = pi.act(desc_to_list(observation))
        observation, reward, done, _ = client.env_step(chosen.tolist(), True)
        frame += 1
        score += reward
        if not done:
            continue
        # Episode finished: report it and request the next one.
        print("score=%0.2f in %i frames" % (score, frame))
        frame = 0
        score = 0
        observation = client.env_reset()
        if not observation:
            break
    client.submit()
def up():  # uploading to CrowdAI
    """Upload the current agent's rollout to the CrowdAI grader.

    Reads the API key from apikey.txt, streams agent actions to the remote
    environment, and calls submit() when the grader stops serving episodes.
    Relies on module-level `agent`, `go`, and `np` — TODO confirm they are
    defined in this module.
    """
    # global _stepsize
    # _stepsize = 0.01
    apikey = open('apikey.txt').read().strip('\n')
    print('apikey is', apikey)
    import opensim as osim
    from osim.http.client import Client
    from osim.env import RunEnv
    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = apikey
    client = Client(remote_base)
    # Create environment
    observation = client.env_create(crowdai_token)
    # old_observation = None
    stepno = 0
    epino = 0
    total_reward = 0
    old_observation = None

    def obg(plain_obs):
        # Process the raw observation, carrying state across steps via
        # the enclosing function's variables.
        nonlocal old_observation, stepno
        processed_observation, old_observation = go(plain_obs, old_observation, step=stepno)
        return np.array(processed_observation)

    print('environment created! running...')
    # Run a single step
    while True:
        proc_observation = obg(observation)
        [observation, reward, done, info] = client.env_step(
            [float(i) for i in list(agent.act(proc_observation))], True)
        stepno += 1
        total_reward += reward
        print('step', stepno, 'total reward', total_reward)
        # print(observation)
        if done:
            observation = client.env_reset()
            old_observation = None  # new episode: clear the closure state
            print('>>>>>>>episode', epino, ' DONE after', stepno, 'got_reward', total_reward)
            total_reward = 0
            stepno = 0
            epino += 1
            if not observation:
                # Falsy observation after reset: no more grader episodes.
                break
    print('submitting...')
    client.submit()
def submit(identifier, policy_fn, seed, iter):
    """Restore a trained policy checkpoint and submit it to the grader.

    Each chosen action is repeated `param.action_repeat` frames, matching the
    frame-skip used during training.
    """
    client = Client(remote_base)
    # Create environment
    observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")
    # Rebuild the policy graph, then load the requested checkpoint iteration.
    pi = train(identifier, policy_fn, 1, 1, seed,
               save_final=False, play=True, bend=0)
    load_state(identifier, iter)
    while True:
        obs_vector = state_desc_to_ob(observation)
        action = pi.act(False, np.array(obs_vector))[0].tolist()
        for _ in range(param.action_repeat):
            observation, reward, done, info = client.env_step(action, True)
            if done:
                break
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()
def submit_agent(args, model_params):
    """Build the actor from saved weights and submit it to the crowdAI grader.

    args: needs .weights, .modeldim, .prosthetic, .difficulty, .token.
    model_params: kwargs for build_model_test().
    """
    ##########################################################
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)
    # Local env is used only for its dict -> vector observation converter.
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    token = args.token
    client = Client(remote_base)
    # Create environment
    di = client.env_create(token, env_id="ProstheticsEnv")
    stat = []  # per-episode [episode, steps, total reward]
    ep = 1
    ii = 0
    reward_sum = 0
    print('\n\n#################################################\n\n')
    while True:
        ii += 1
        proj = env.dict_to_vec(di)
        action = actor.act(proj)
        # NOTE(review): uniform noise in [0, 0.1) is added to every action at
        # submission time — confirm this exploration noise is intended here.
        action += np.random.rand(len(action)) / 10.
        [di, reward, done, info] = client.env_step(action.tolist(), True)
        reward_sum += reward
        print('ep: ' + str(ep) + ' >> step: ' + str(int(ii)) + ' >> reward: ' + format(reward, '.2f') + ' \t' + str(int(reward_sum)) + '\t >> pelvis X Y Z: \t' + format(di['body_pos']['pelvis'][0], '.2f') + '\t' + format(di['body_pos']['pelvis'][1], '.2f') + '\t' + format(di['body_pos']['pelvis'][2], '.2f'))
        if done:
            print('\n\n#################################################\n\n')
            stat.append([ep, ii, reward_sum])
            di = client.env_reset()
            ep += 1
            ii = 0
            reward_sum = 0
            if not di:
                # Grader stopped serving episodes.
                break
    for e in stat:
        print(e)
    print('\n\nclient.submit()\n\n')
    client.submit()
    ##########################################################
    print('\n\n#################################################\n\n')
    print('DONE\n\n')
class NIPS(object):
    """Local/remote switchable wrapper for the NIPS Learning-to-Run env.

    Without a token the environment runs locally via RunEnv; with a token all
    calls are proxied to the crowdAI grader through an HTTP Client. Since the
    Client exposes no spaces, a hidden head-less RunEnv supplies them.
    """

    def __init__(self, visualize=False, token=None, max_obstacles=3):
        logger.info("max_obstacles={}".format(max_obstacles))
        self.remote_env = token is not None
        if self.remote_env:
            # Remote mode: keep a local env around purely for its spaces.
            self.local_env = RunEnv(visualize=False, max_obstacles=max_obstacles)
            self.token = token
            self.env = Client(GRADER_URL)
            self.env_created = False
        else:
            self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)

    @property
    def observation_space(self):
        # Client has no observation_space; borrow the local env's.
        space_source = self.local_env if self.remote_env else self.env
        return space_source.observation_space

    @property
    def action_space(self):
        # Client has no action_space; borrow the local env's.
        space_source = self.local_env if self.remote_env else self.env
        return space_source.action_space

    def reset(self):
        if not self.remote_env:
            return self.env.reset(difficulty=2)
        if not self.env_created:
            ob = self.env.env_create(self.token)
            self.env_created = True
            return ob
        return self.env.env_reset()

    def step(self, action):
        if self.remote_env:
            return self.env.env_step(action.tolist(), True)
        return self.env.step(action)

    def close(self):
        # Remote sessions are finalized by submitting the run.
        if self.remote_env:
            self.env.submit()
        else:
            self.env.close()
class WrapperClient():
    """Wraps the grader Client, augmenting observations with finite-difference
    velocity features (ob_1) computed from preprocessed positions (ob_0)."""

    def __init__(self, remote_base):
        self.client = Client(remote_base)
        # NOTE(review): np.array(41) is a 0-d scalar placeholder, not a
        # 41-element vector — confirm this is only ever overwritten before use.
        self.ob_0 = np.array(41)
        self.ob_1 = np.zeros(14)
        # self.ob_2 = np.zeros(41)

    def env_create(self, token):
        # NOTE(review): the `token` parameter is ignored; a token is
        # hard-coded below — confirm this is intentional.
        self.ob_0 = self.preprocess(
            np.array(
                self.client.env_create("7be35dd3a64deac826068d37c2258847")))
        # return np.concatenate((self.ob_0,self.ob_1,self.ob_2),axis=0)
        return np.concatenate((self.ob_0, self.ob_1), axis=0)

    def env_reset(self):
        ob = self.client.env_reset()
        if ob is None:
            return None
        self.ob_0 = self.preprocess(np.array(ob))
        self.ob_0[1] = 0
        self.ob_1 = np.zeros(14)  # velocities restart at zero each episode
        # self.ob_2 = np.zeros(41)
        # return np.concatenate((self.ob_0,self.ob_1,self.ob_2),axis=0)
        return np.concatenate((self.ob_0, self.ob_1), axis=0)

    def env_step(self, action):
        res = self.client.env_step(action)
        ob_0_post = self.ob_0  # previous preprocessed observation
        # ob_1_post = self.ob_1
        # ob_2_post = self.ob_2
        self.ob_0 = self.preprocess(np.array(res[0]))
        self.ob_0[1] = 0
        # Finite-difference velocity over the 0.01 s timestep.
        self.ob_1 = (self.ob_0[22:36] - ob_0_post[22:36]) / 0.01
        # self.ob_2 = self.ob_1 - ob_1_post
        # res[0] = np.concatenate((self.ob_0,self.ob_1,self.ob_2),axis=0)
        # NOTE(review): only the augmented observation is returned; reward,
        # done, and info from `res` are dropped and the `return res` below is
        # unreachable dead code — confirm callers expect this.
        return np.concatenate((self.ob_0, self.ob_1), axis=0)
        return res

    def submit(self):
        self.client.submit()

    def preprocess(self, v):
        # Re-express selected coordinates relative to the pelvis (v[1], v[2]).
        # NOTE(review): index 1 is processed first, so v[1] becomes 0 and the
        # later `- v[1]` subtractions are no-ops — confirm this ordering is
        # intended.
        n = [1, 18, 22, 24, 26, 28, 30, 32, 34]
        m = [19, 23, 25, 27, 29, 31, 33, 35]
        for i in n:
            v[i] = v[i] - v[1]
        for i in m:
            v[i] = v[i] - v[2]
        v[20] = v[20] - v[4]
        v[21] = v[21] - v[5]
        return v
def submit():
    """Submit the discretized TF policy to the crowdAI ProstheticsEnv grader.

    Relies on module-level `sess`, `sy_sampled_ac`, `sy_ob_no`, `bins`, `np`.
    """
    from osim.http.client import Client
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "01342e360022c2def5c2cc04c5843381"
    # FIX: the instance previously shadowed the imported `Client` class
    # (`Client = Client(remote_base)`); use a lowercase local name instead.
    client = Client(remote_base)
    observation = client.env_create(env_id="ProstheticsEnv", token=crowdai_token)
    while True:
        k = np.reshape(np.array(observation), newshape=(-1, len(observation)))
        ac_ind = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
        ac_ind = np.reshape(ac_ind, newshape=(ac_ind.shape[1]))
        action = bins[ac_ind]  # map discrete bin indices to muscle activations
        [observation, reward, done, info] = client.env_step(action, True)
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()
class RobotControlNipsClient(RobotControlNipsLocal):
    '''
    Remote variant of RobotControlNipsLocal: initiates the simulator
    connection to the AIcrowd grader instead of a local environment.
    '''

    # NOTE(review): does not call super().__init__() — confirm the base class
    # setup is intentionally skipped for the remote case.
    def __init__(self):
        # Settings
        self.remote_base = "http://osim-rl-grader.aicrowd.com/"
        self.aicrowd_token = "a66245c8324e2d37b92f098a57ef3f99"  # use your aicrowd token
        # your aicrowd token (API KEY) can be found at your profile page at https://www.aicrowd.com
        self.client = Client(self.remote_base)
        # Create environment
        self.observation = self.client.env_create(self.aicrowd_token, env_id='L2M2019Env')
        self.reward = 0  # running reward accumulator
def submit(args):
    """Load a trained DDPG model and submit its rollout to the grader."""
    print('start submitting')
    remote_base = 'http://grader.crowdai.org:1733'
    client = Client(remote_base)
    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    state = client.env_create(TOKEN)
    # The feature generator is stateful, so a fresh one is made per episode.
    fg = FeatureGenerator()
    state = fg.gen(state)
    step = 0
    ep_reward = 0
    while True:
        print('selecting action ...', end=' ')
        action = ddpg.select_action(list(state))
        print('client.env_step ...')
        raw_next, reward, done, info = client.env_step(action.tolist())
        next_state = fg.gen(raw_next)
        # Prints the totals *before* folding in this step's reward.
        print('step: {0:03d}, ep_reward: {1:02.08f}'.format(step, ep_reward))
        state = next_state
        ep_reward += reward
        step += 1
        if not done:
            continue
        print('done')
        state = client.env_reset()
        if not state:
            break
        # New episode: reset counters and feature-generator state.
        step = 0
        ep_reward = 0
        fg = FeatureGenerator()
        state = fg.gen(state)
    client.submit()
def submit(self):
    """Run this runner's agent against the crowdAI grader and submit.

    The grader serves several consecutive episodes; env_reset() returns a
    falsy value once the final one has finished.
    """
    grader_url = 'http://grader.crowdai.org:1729'
    env = RunEnv(visualize=self.visualize)  # constructed for its side effects
    client = Client(grader_url)
    # Create environment
    obs = client.env_create(self.submit_token)
    finished = False
    while not finished:
        obs, reward, done, info = client.env_step(self.agent.forward(obs))
        if done:
            obs = client.env_reset()
            finished = not obs
    client.submit()
class RemoteSubmit(object):
    """Selects an agent by name and drives a full grader submission run."""

    def __init__(self, token, agent_type):
        # token: crowdAI API token; agent_type: 'random'|'fixed-action'|'a3c'.
        self.token = token
        self.remote_base = "http://grader.crowdai.org:1729"
        self.client = Client(self.remote_base)
        # TODO:: Add agent selector
        if agent_type == 'random':
            self.agent = RandomAgent()
        elif agent_type == 'fixed-action':
            self.agent = FixedActionAgent()
        elif agent_type == 'a3c':
            self.agent = A3CAgent(num_envs=2, num_steps=50, max_frames=1000)
        else:
            status = {
                'status': 'ERROR',
                'error_msg': 'Not supported agent-type'
            }
            raise Exception(status)

    def run(self):
        """Play all grader episodes with the selected agent, then submit.

        Any failure is re-raised wrapped in an Exception carrying a status dict.
        """
        try:
            # NOTE(review): the result of agent.run() is assigned but never
            # used — confirm whether it is needed (e.g. warm-up/training).
            status = self.agent.run()
            observation = self.client.env_create(self.token, env_id="ProstheticsEnv")
            while True:
                action = self.agent.get_action(observation)
                [observation, reward, done, info] = self.client.env_step(action, False)
                if done:
                    observation = self.client.env_reset()
                    if not observation:
                        # Grader has no more episodes to serve.
                        break
            self.client.submit()
        except Exception as e:
            status = {'status': 'ERROR', 'error_msg': e}
            raise Exception(status)
def main():
    # NOTE: Python 2 code (print statements, raw_input, cPickle).
    # Loads a pickled agent snapshot from an HDF5 file and runs one 500-step
    # episode on the grader, asking for confirmation before submitting.
    # Settings
    remote_base = 'http://grader.crowdai.org'
    # Command line parameters
    parser = argparse.ArgumentParser(
        description='Submit the result to crowdAI')
    parser.add_argument("hdf")
    parser.add_argument('--token', dest='token', action='store', required=True)
    args = parser.parse_args()
    hdf = h5py.File(args.hdf, 'r')
    env = GaitEnv(visualize=False)
    # Deserialize the snapshot saved at iteration '0995'.
    agent = cPickle.loads(hdf['agent_snapshots']['0995'].value)
    agent.stochastic = False  # deterministic policy for grading
    client = Client(remote_base)
    # Create environment
    observation = client.env_create(args.token)
    total_reward = 0
    # Run a single step
    for i in range(501):
        ob = agent.obfilt(observation)
        a, _info = agent.act(ob)
        [observation, reward, done, info] = client.env_step(a.tolist(), True)
        print i, reward, done
        total_reward += reward
        if done:
            break
    print 'TOTAL REWARD: ', total_reward
    raw_input('press ENTER to submit')
    client.submit()
import numpy as np
import argparse

# Random-action submission script: plays the grader episodes with sampled
# actions and submits. NOTE(review): RunEnv and Client are not imported in
# this chunk — presumably imported elsewhere in the file; confirm.

# Settings
remote_base = 'http://grader.crowdai.org:1729'

# Command line parameters
parser = argparse.ArgumentParser(description='Submit the result to crowdAI')
parser.add_argument('--token', dest='token', action='store', required=True)
args = parser.parse_args()

env = RunEnv(visualize=False)
client = Client(remote_base)

# Create environment
observation = client.env_create(args.token)

# Run a single step
#
# The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
while True:
    # NOTE(review): `v` is computed but never used — dead code, confirm.
    v = np.array(observation).reshape((-1, 1, env.observation_space.shape[0]))
    [observation, reward, done, info] = client.env_step(env.action_space.sample().tolist())
    print(observation)
    if done:
        observation = client.env_reset()
        if not observation:
            break
client.submit()
def upload(frameskip = 1):
    """Upload the agent's rollout to the grader, repeating each action
    `frameskip` times. Relies on module-level `agent` and `process_obs_dict`
    — TODO confirm they are defined in this module."""
    from osim.http.client import Client
    apikey = open('apikey.txt').read().strip('\n')
    print('Using apikey:', apikey)
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = apikey
    print('connecting...')
    client = Client(remote_base)
    observation_d = client.env_create(crowdai_token, env_id="ProstheticsEnv")
    #observation = process_obs_dict(observation_d)
    print('environment created! running...')
    #obs_collect = []
    #a_collect = []
    stepno = 0
    epino = 0
    total_reward = 0
    while True:
        #a = AGENT OUTPUT
        observation = process_obs_dict(observation_d)
        a, q = agent.act(observation)
        a = [float(i) for i in list(a)]
        #obs_collect.append(observation)
        #a_collect.append(a)
        # Apply the same action for `frameskip` consecutive grader steps.
        for _ in range(frameskip):
            [observation_d, reward, done, info] = client.env_step(a, True)
            stepno += 1
            total_reward += reward
            print('step', stepno, 'total reward', total_reward)
            if done:
                '''
                print('')
                print('saving...')
                print('')
                with open('upload_saves/upload_a_collect_' + str(epino) + '.p', 'wb') as f:
                    pickle.dump(a_collect, f)
                with open('upload_saves/upload_obs_collect_' + str(epino) + '.p', 'wb') as f:
                    pickle.dump(obs_collect, f)
                '''
                observation_d = client.env_reset()
                print('>> episode', epino, ' Done after', stepno, 'got reward:', total_reward)
                print('')
                total_reward = 0
                stepno = 0
                epino += 1
                break
        # A falsy observation after reset means the grader is done serving.
        if not observation_d:
            break
    print('Done! Submitting...')
    client.submit()
# remote_host=REMOTE_HOST, # remote_port=REMOTE_PORT # ) # # Create environment # observation = client.env_create() # Settings remote_base = "http://osim-rl-grader.aicrowd.com/" aicrowd_token = "b5f5cd09cb870c14547db176596d09e5" # use your aicrowd token # your aicrowd token (API KEY) can be found at your prorfile page at https://www.aicrowd.com client = Client(remote_base) # Create environment observation = client.env_create(aicrowd_token, env_id='L2M2019Env') """ The grader runs N simulations of at most 1000 steps each. We stop after the last one A new simulation starts when `clinet.env_step` returns `done==True` and all the simulations end when the subsequent `client.env_reset()` returns a False """ mode = '3D' difficulty = 2 visualize = False seed = None sim_dt = 0.01 sim_t = 10 timstep_limit = int(round(sim_t / sim_dt))
# Restore a trained DDPG agent and stream its actions to the grader.
# NOTE(review): `Client` is used like an instance (Client.env_create /
# Client.env_step) — presumably rebound to a client object elsewhere in this
# file; confirm, as calling these on the class itself would fail.
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)
# Observation dim 158, action dim 19 (ProstheticsEnv).
agent = DDPG(actor, critic, memory, (158,), (19,), gamma=0.99)
saver = tf.train.Saver()
# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
sess = tf.InteractiveSession()
agent.initialize(sess)
sess.graph.finalize()  # guard against accidental graph mutation after init
agent.reset()
filename = "/home/vaisakhs_shaj/Desktop/MODEL/tfSteps" + str(30000) + ".model"
saver.restore(sess, filename)
# Create environment
observation = Client.env_create(env_id="ProstheticsEnv", token=crowdai_token)
#print([n.name for n in tf.get_default_graph().as_graph_def().node])

def my_controller(obs):
    # Flatten the state-description dict and query the deterministic policy.
    obs = np.array(dict_to_list(obs))
    action = agent.pi(obs, apply_noise=False, compute_Q=False)[0]
    action = action.tolist()
    return action

while True:
    [observation, reward, done, info] = Client.env_step(my_controller(observation), True)
import opensim as osim
from osim.http.client import Client
from osim.env import RunEnv
import pickle, sys
import numpy as np

# Replays pre-recorded action sequences (loaded from pickle files) against
# the grader instead of querying a live policy.

# Settings
remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "e5d9c43bc6add5150e8e23029d118215"
client = Client(remote_base)

# Create environment
observation = client.env_create(crowdai_token)

f = open('values_jump_new.txt', 'rb')
arrs = pickle.load(f)
g = open('values_second_leg.txt', 'rb')
arrs_new = pickle.load(g)  # NOTE(review): loaded but unused here — confirm

def my_controller(observation, ctr):
    # Replay step `ctr` of the recorded sequence, clamping to the last action.
    # NOTE(review): `max_action_steps` must be defined elsewhere in this
    # file — TODO confirm.
    return [float(x) for x in list(arr_list[min(ctr, max_action_steps - 1)])]

ep_no = 2
arr_list = arrs[ep_no]  # action sequence replayed by my_controller
ep_no_new = 1
def main():
    """Restore one or more DDPG models, blend them, optionally evaluate
    locally, and optionally submit the blended agent to the crowdAI grader."""
    args = parse_args()
    logger.configure()
    # Fixed DDPG hyperparameters used when rebuilding the agents.
    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.
    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)
    top_model_dir = 'top-models/'
    # create tf sessions and graphs — one isolated graph/session per model
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))
    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            #with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                #tf.global_variables_initializer()
                # restore agents from model files and store in ddpg_agents
                print("Restoring from..." + model_name)
                # Configure components.
                memory = Memory(limit=int(1e6),
                                action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu',
                                layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])
                agent = DDPG(actor, critic, memory,
                             env.observation_space.shape,
                             env.action_space.shape, gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size, action_noise=action_noise,
                             param_noise=param_noise,
                             critic_l2_reg=critic_l2_reg, actor_lr=actor_lr,
                             critic_lr=critic_lr, enable_popart=popart,
                             clip_norm=clip_norm, reward_scale=reward_scale)
                # restore adam state and param noise
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)
                # restore network weights
                saver.restore(sess, restore_model_path)
                # Adam moments/step and param-noise state live in a sidecar
                # pickle next to the checkpoint.
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
                # intialize and prepare agent session.
                agent.initialize(sess)
                #sess.graph.finalize()
                agent.reset()
                ddpg_agents.append(agent)
    # The blended agent averages/combines the restored policies.
    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)
    if args.evaluation:
        # setup eval env
        eval_env = prosthetics_env.EvaluationWrapper(
            osim_env.ProstheticsEnv(visualize=False),
            frameskip=4,
            reward_shaping=True,
            reward_shaping_x=1,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        eval_env.change_model(model=('3D').upper(), prosthetic=True,
                              difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                   nb_eval_steps=nb_eval_steps,
        #                                                   render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent,
                                                           nb_eval_steps,
                                                           render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))
    # Submit to crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token,
                                                  env_id="ProstheticsEnv")
        # The transform yields both the raw dict and the projected feature
        # vector the policy consumes.
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False,
                                 compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                # Log whenever clipping actually altered the action.
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug(" openai-action:", action)
                logger.debug(" submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(
                clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    # No further grader episodes: stop stepping.
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()
    for i in range(len(sess_list)):
        sess_list[i].close()
def observation_filter(observation):
    # Identity pass-through: hook point for observation preprocessing.
    return observation

def action_filter(action):
    # Identity pass-through: hook point for action postprocessing.
    return action

def reward_filter(observation, action, reward):
    # Identity pass-through: hook point for reward shaping.
    return reward

# need to change
observation = observation_filter(client.env_create(args.token))
action = action_filter(np.zeros(env.action_space.shape))
numo = len(observation)
numa = len(action)
print("numo = " + str(numo) + " numa = " + str(numa))
sumreward = 0
numsteps = 0
first = 1
# Serve environment transitions over a socket using a binary protocol.
# NOTE(review): the loop body is truncated in this chunk — the remainder
# (protocol dispatch on `cmd`, building `req`) continues past this view.
while True:
    message = socket.recv()
    off = 0
    if USE_BINARY_PROTO:
        # First byte of the message is the command opcode.
        cmd = struct.unpack_from('@B', message, offset=off)[0]
        off += 1
        req = bytearray()
use_lstm = True

if args.token:
    # Submit to competition
    # Reference: https://github.com/stanfordnmbl/osim-rl/blob/master/examples/submit.py
    remote_base = 'http://grader.crowdai.org:1729'  # Submission to Round-1
    #remote_base = 'http://grader.crowdai.org:1730' # Submission to Round-2
    crowdai_token = args.token
    # Dummy environment, just need process_state_desc()
    dummy_env = create_env(agent_config['env_config'])
    # Create environment w/ Client
    client = Client(remote_base)
    state_desc = client.env_create(crowdai_token, env_id="ProstheticsEnv")
    state = dummy_env.process_state_desc(state_desc)  # initial state
    if use_lstm:
        # Initial hidden state at start of episode
        hidden = agent.local_evaluator.policy_map['default'].get_initial_state()
    # Evaluation loop
    # NOTE(review): the loop body is truncated in this chunk — stepping the
    # client and the done/reset handling continue past this view.
    while True:
        # NOTE TODO: reduce action space is hard-coded in train.py!
        if use_lstm:
            # Recurrent policy: thread the hidden state through each step.
            action, hidden, logits_dict = agent.compute_action(
                observation=state, state=hidden)
        else:
            action = agent.compute_action(state)
        action = dummy_env.expand_action(action)  # get back original action space, this is also a list now
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate,
         submit):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    env, obs_dim, act_dim = init_osim(animate)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    if mpi_util.rank == 0:
        #aigym_path = os.path.join('/tmp', env_name, now)
        #env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)
    episode = 0
    checkpoint = Checkpoint("saves", now)
    # restore from checkpoint?
    if restore_path:
        (policy, val_func, scaler, episode, obs_dim, act_dim,
         kl_targ) = checkpoint.restore(restore_path)
    else:
        policy = Policy(obs_dim, act_dim, kl_targ)
        val_func = NNValueFunction(obs_dim)
        scaler = Scaler(obs_dim)
        if mpi_util.rank == 0:
            # run a few episodes (on node 0) of untrained policy to initialize scaler:
            trajectories = run_policy(env, policy, scaler, episodes=5)
            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations
        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)
        if mpi_util.rank == 0:
            checkpoint.save(policy, val_func, scaler, episode)
    if animate:
        # Visualization-only run: play one episode and exit.
        observes, actions, rewards, unscaled_obs = run_episode(env, policy,
                                                               scaler,
                                                               animate=animate)
        exit(0)
    if submit:
        # Submission mode: stream the trained policy to the crowdAI grader.
        # Settings
        #remote_base = 'http://grader.crowdai.org:1729'
        remote_base = 'http://grader.crowdai.org:1730'
        token = 'a83412a94593cae3a491f3ee28ff44e1'
        client = Client(remote_base)
        # Create environment
        observation = client.env_create(token)
        step = 0.0
        observes, actions, rewards, unscaled_obs = [], [], [], []
        scale, offset = scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        # Run a single step
        #
        # The grader runs 3 simulations of at most 1000 steps each.
        # We stop after the last one
        while True:
            obs = np.array(observation).astype(np.float32).reshape((1, -1))
            print("OBSERVATION TYPE:", type(obs), obs.shape)
            print(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale  # center and scale observations
            observes.append(obs)
            action = policy.sample(obs).astype(np.float32).reshape((-1, 1))
            print("ACTION TYPE:", type(action), action.shape)
            print(action)
            actions.append(action)
            [observation, reward, done, info] = client.env_step(action.tolist())
            print("step:", step, "reward:", reward)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            step += 1e-3  # increment time step feature
            if done:
                print(
                    "================================== RESTARTING ================================="
                )
                observation = client.env_reset()
                step = 0.0
                observes, actions, rewards, unscaled_obs = [], [], [], []
                scale, offset = scaler.get()
                scale[-1] = 1.0  # don't scale time step feature
                offset[-1] = 0.0  # don't offset time step feature
                if not observation:
                    break
        client.submit()
        exit(0)
    ######
    # Training mode: each MPI worker collects a share of the batch.
    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)
    batch = 0
    while episode < num_episodes:
        # Periodic checkpoint every 10 batches (node 0 only).
        if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0:
            checkpoint.save(policy, val_func, scaler, episode)
        batch = batch + 1
        trajectories = run_policy(env, policy, scaler,
                                  episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)
        if mpi_util.rank == 0:
            # concatentate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])
            episode += len(trajectories)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode)
            policy.update(observes, actions, advantages, logger)  # update policy
            val_func.fit(observes, disc_sum_rew, logger)  # update value function
            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations
            logger.write(
                display=True)  # write logger results to file and stdout
        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False
        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)
    if mpi_util.rank == 0:
        logger.close()
        policy.close_sess()
    if mpi_util.rank == 0:
        val_func.close_sess()
"""Submit a random-action ProstheticsEnv run to the crowdAI grader."""
from osim.http.client import Client
from osim.env import ProstheticsEnv
import numpy as np
import argparse

# Settings
remote_base = 'http://grader.crowdai.org:1729'

# Command line parameters
parser = argparse.ArgumentParser(description='Submit the result to crowdAI')
parser.add_argument('--token', dest='token', action='store', required=True)
args = parser.parse_args()

client = Client(remote_base)

# Create environment
observation = client.env_create(args.token, env_id="ProstheticsEnv")
env = ProstheticsEnv()

# The grader runs 3 simulations of at most 1000 steps each. Keep sampling
# random actions until the reset after the final episode returns nothing.
grading_finished = False
while not grading_finished:
    print(observation)
    sampled_action = env.action_space.sample().tolist()
    observation, reward, done, info = client.env_step(sampled_action)
    if done:
        # A falsy observation from env_reset() means all episodes are done.
        observation = client.env_reset()
        grading_finished = not observation

client.submit()
"""Replay a stored MAP-Elites elite genome as a ProstheticsEnv controller."""
from evo_rbc.main.prosthetic_map_elites.common import get_MAPElites

# Restore the saved repertoire of elite genomes from disk.
load_path = "map_elites_repertoire_50.pkl"
map_elites = get_MAPElites()
map_elites.load_repertoire(load_path)

# Settings
# NOTE(review): the crowdAI token is hard-coded; consider passing it via CLI.
remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "f5969a7bb0466e0da072c72d6eb6d667"

client = Client(remote_base)

# Create environment
observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")


# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
def my_controller(observation, time_step):
    """Return the 19-muscle activation list produced by the elite genome
    stored in repertoire bin (0,) at the given time step (observation is
    unused — the genome is an open-loop time-based controller)."""
    genome = map_elites.container.grid[(0, )]["genome"]
    return [
        genome.control_function(muscle_index=idx, time_step=time_step)[0]
        for idx in range(19)
    ]
args = parser.parse_args()

# Resolve the agent class by name from this module's globals.
if args.agent not in globals():
    raise ValueError('[run] Agent {} not found.'.format(args.agent))
SpecifiedAgent = globals()[args.agent]

# The modes are mutually exclusive: submitting rules out training/visualizing.
if args.submit and args.nb_steps:
    raise ValueError('[run] Cannot train and submit agent at same time.')
if args.submit and args.visualize:
    raise ValueError('[run] Cannot visualize agent while submitting.')

if args.submit:
    # Submit agent
    # Wrap the remote grader client so it exposes the same list-observation,
    # JSON-safe interface the agents expect from a local env.
    client = Client(remote_base)
    client.env_create(crowdai_token, env_id='ProstheticsEnv')
    client_env = ClientToEnv(client)
    client_env = DictToListFull(client_env)
    client_env = JSONable(client_env)
    agent = SpecifiedAgent(client_env.observation_space,
                           client_env.action_space)
    agent.submit(client_env)
elif args.nb_steps:
    # Train agent locally
    env = ProstheticsEnv(visualize=args.visualize)
    env = ForceDictObservation(env)
    env = DictToListFull(env)
    env = JSONable(env)
    agent = SpecifiedAgent(env.observation_space, env.action_space)
    # Only the MPI root rank configures logging output.
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
# if done: # observation = env.reset() # if not observation: # break # client.submit() # If TEST and no TOKEN, run some test experiments if args.token: agent.load_weights(args.model) remote_base = 'http://grader.crowdai.org:1729' client = Client(remote_base) # Create environment observation = client.env_create(args.token) # Run a single step # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one while True: v = np.array(observation).reshape((env.observation_space.shape[0])) action = agent.forward(v) [observation, reward, done, info] = client.env_step(action.tolist()) observation = process_observation(observation) total_reward += reward if done: observation = client.env_reset() if not observation: break client.submit()
def str2bool(v: str) -> bool:
    """Interpret common affirmative strings as True (case-insensitive)."""
    return v.lower() in ("yes", "true", "1")


remote_base = 'http://grader.crowdai.org:1729'
client = Client(remote_base)
# Python 2 style ConfigParser; readfp is deprecated in Python 3 (read_file).
config = ConfigParser.ConfigParser()
config.readfp(open('config.ini'))
learning = False
env = RunEnv(visualize=False)
# NOTE(review): token is hard-coded here; consider reading it from config/CLI.
observation = client.env_create('2060a86df422ef5a67dea16c5320c8ad')
nb_sensors = env.observation_space.shape[0]
# Pick the agent implementation from the config file.
if config.get('simulation', 'agent_type') == 'cacla':
    ag = CaclaAg(env.action_space.shape[0], nb_sensors)
else:
    ag = OffNFACAg(env.action_space.shape[0], nb_sensors)
ag.load(int(config.get('simulation', 'load_episode')))
stop = False
while not stop:
    #env stoch but testing only on one episode
    ag.start_ep(observation, learning)
    ac = ag.run(0, observation, learning, False, False)
    total_reward = 0.0
    step = 0
    while True:
        #-1 1 to 0 1