def submit(agent, logger, jump=False):
    token = None
    assert token is not None, "You need to provide your token to submit()"

    # Settings
    remote_base = 'http://grader.crowdai.org:1729'
    client = Client(remote_base)

    # Create environment
    new_ob = client.env_create(token)
    agent.ob_processor.reset()
    zero_action = np.zeros(agent.env.action_space.shape).tolist()
    first_frame = True
    done = False

    # Run a single step
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    episode_count = 0
    episode_steps = 0
    episode_reward = 0
    all_rewards = []
    while True:
        # ignore first frame because it contains phantom obstacle
        if first_frame:
            new_ob, reward, done, info = client.env_step(zero_action, True)
            episode_reward += reward
            episode_steps += 1
            first_frame = False
            assert not done, "Episode finished in one step"
            continue

        new_ob = agent.ob_processor.process(new_ob)
        observation = np.reshape(new_ob, [1, -1])
        action, _ = agent.actor.predict(observation)
        action = np.clip(action, agent.act_low, agent.act_high)
        act_to_apply = action.squeeze()
        if jump:  # fixed: was `self.jump`, but this is a module-level function
            act_to_apply = np.tile(act_to_apply, 2)
        [new_ob, reward, done, info] = client.env_step(act_to_apply.tolist(), True)
        episode_steps += 1
        episode_reward += reward
        logger.info("step={}, reward={}".format(episode_steps, reward))

        if done:
            episode_count += 1
            logger.info("Episode={}, steps={}, reward={}".format(
                episode_count, episode_steps, episode_reward))
            all_rewards.append(episode_reward)
            episode_steps = 0
            episode_reward = 0
            new_ob = client.env_reset()
            agent.ob_processor.reset()
            first_frame = True
            if not new_ob:
                break

    client.submit()
    logger.info("All rewards: {}".format(all_rewards))
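All of the scripts in this section drive the same crowdAI grader protocol: env_create() starts the first episode, env_step() advances it, env_reset() starts the next episode and returns a falsy observation once the grader has run its last episode, and submit() finalizes the run. A minimal sketch of that skeleton, assuming a hypothetical policy(observation) callable and token value:

from osim.http.client import Client

def run_submission(policy, token, remote_base='http://grader.crowdai.org:1729'):
    # Minimal grader loop; `policy` and `token` are assumptions, not from the source.
    client = Client(remote_base)
    observation = client.env_create(token)  # starts the first episode
    while True:
        action = policy(observation)  # must be a JSON-serializable list of floats
        observation, reward, done, info = client.env_step(action, True)
        if done:
            observation = client.env_reset()  # starts the next episode...
            if not observation:               # ...or is falsy after the last one
                break
    client.submit()  # finalize the submission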
def submit():
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "[YOUR_CROWD_AI_TOKEN_HERE]"
    client = Client(remote_base)

    task_fn = lambda: LTR()
    task = task_fn()
    state_dim = task.env.observation_space.shape[0]
    action_dim = task.env.action_space.shape[0]
    with open('data/ddpg-model-LearningToRun.bin', 'rb') as f:
        model = pickle.load(f)
    actor = DDPGActorNet(state_dim, action_dim)
    actor.load_state_dict(model)

    # Create environment
    state = client.env_create(crowdai_token)
    total_reward = 0.0
    while True:
        action = actor.predict(np.stack([state]), to_numpy=True).flatten()
        [state, reward, done, info] = client.env_step(action, True)
        total_reward += reward
        print(state)  # fixed: printed an undefined `observation`
        if done:
            state = client.env_reset()  # fixed: result was assigned to `observation`, but the loop reads `state`
            if not state:
                break
    print(total_reward)  # fixed: Python 2 print statement
    client.submit()
def up():
    # uploading to CrowdAI
    # global _stepsize
    # _stepsize = 0.01
    apikey = open('apikey.txt').read().strip('\n')
    print('apikey is', apikey)

    import opensim as osim
    from osim.http.client import Client
    from osim.env import RunEnv

    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = apikey

    client = Client(remote_base)

    # Create environment
    observation = client.env_create(crowdai_token)
    # old_observation = None
    stepno = 0
    epino = 0
    total_reward = 0
    old_observation = None

    def obg(plain_obs):
        nonlocal old_observation, stepno
        processed_observation, old_observation = go(plain_obs, old_observation, step=stepno)
        return np.array(processed_observation)

    print('environment created! running...')

    # Run a single step
    while True:
        proc_observation = obg(observation)
        [observation, reward, done, info] = client.env_step(
            [float(i) for i in list(agent.act(proc_observation))], True)
        stepno += 1
        total_reward += reward
        print('step', stepno, 'total reward', total_reward)
        # print(observation)
        if done:
            observation = client.env_reset()
            old_observation = None
            print('>>>>>>>episode', epino, ' DONE after', stepno, 'got_reward', total_reward)
            total_reward = 0
            stepno = 0
            epino += 1
            if not observation:
                break

    print('submitting...')
    client.submit()
def submit(identifier, policy_fn, seed, iter):
    client = Client(remote_base)

    # Create environment
    observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")

    # IMPLEMENTATION OF YOUR CONTROLLER
    pi = train(identifier, policy_fn, 1, 1, seed, save_final=False, play=True, bend=0)
    load_state(identifier, iter)

    while True:
        ob = state_desc_to_ob(observation)
        action = pi.act(False, np.array(ob))[0].tolist()
        for _ in range(param.action_repeat):
            [observation, reward, done, info] = client.env_step(action, True)
            if done:
                break
        if done:
            observation = client.env_reset()
            if not observation:
                break

    client.submit()
def submit(pi):
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "0dd7c22f5eb61cb4453b5a5b8e510656"
    client = Client(remote_base)
    observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")

    frame = score = 0
    while True:
        a = pi.act(desc_to_list(observation))
        [observation, reward, done, _] = client.env_step(a.tolist(), True)
        score += reward
        frame += 1
        if done:
            print("score=%0.2f in %i frames" % (score, frame))
            frame = score = 0
            observation = client.env_reset()
            if not observation:
                break
    client.submit()
class RemoteProstheticsEnv(gym.Env):
    def __init__(self, base, token, round):
        self.base = base
        self.token = token
        self.client = None
        ## simulate local env
        self.osim_model = OSmodel()
        self.time_limit = 300 if round == 1 else 1000

    def reset(self, project=True):
        if self.client is None:
            self.client = Client(self.base)
            obs = self.client.env_create(self.token, env_id='ProstheticsEnv')
            self.osim_model.istep = 0
            return obs
        else:
            ### It is not allowed to call reset() twice in submitting.
            raise NotImplementedError

    def step(self, action, project=True):
        self.osim_model.istep += 1
        [obs, reward, done, info] = self.client.env_step(action.tolist(), render=True)
        if done:
            self.osim_model.istep = 0
            obs = self.client.env_reset()
            if not obs:
                done = True
            else:
                done = False
        return obs, reward, done, info
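Because the wrapper above folds grader resets into step() (done only becomes True once env_reset() returns a falsy observation), a single flat loop can evaluate across all grader episodes. A usage sketch, where BASE, TOKEN, and policy are assumptions:

env = RemoteProstheticsEnv(base=BASE, token=TOKEN, round=1)
obs = env.reset()  # creates the remote environment on the first call
done = False
while not done:
    action = policy(obs)  # hypothetical controller returning a numpy array
    obs, reward, done, info = env.step(action)
env.client.submit()  # finalize once the grader has exhausted its episodes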
def submit_agent(args, model_params):
    ##########################################################
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)

    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    token = args.token
    client = Client(remote_base)

    # Create environment
    di = client.env_create(token, env_id="ProstheticsEnv")

    stat = []
    ep = 1
    ii = 0
    reward_sum = 0
    print('\n\n#################################################\n\n')
    while True:
        ii += 1
        proj = env.dict_to_vec(di)
        action = actor.act(proj)
        action += np.random.rand(len(action)) / 10.

        [di, reward, done, info] = client.env_step(action.tolist(), True)
        reward_sum += reward
        print('ep: ' + str(ep) + ' >> step: ' + str(int(ii)) +
              ' >> reward: ' + format(reward, '.2f') + ' \t' + str(int(reward_sum)) +
              '\t >> pelvis X Y Z: \t' + format(di['body_pos']['pelvis'][0], '.2f') +
              '\t' + format(di['body_pos']['pelvis'][1], '.2f') +
              '\t' + format(di['body_pos']['pelvis'][2], '.2f'))
        if done:
            print('\n\n#################################################\n\n')
            stat.append([ep, ii, reward_sum])
            di = client.env_reset()
            ep += 1
            ii = 0
            reward_sum = 0
            if not di:
                break

    for e in stat:
        print(e)
    print('\n\nclient.submit()\n\n')
    client.submit()
    ##########################################################
    print('\n\n#################################################\n\n')
    print('DONE\n\n')
def submit(actor, critic, args, act_update_fn):
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)

    client = Client(REMOTE_BASE)

    all_episode_metrics = []
    episode_metrics = {
        "reward": 0.0,
        "step": 0,
    }
    observation_handler = create_observation_handler(args)
    action_handler = create_action_handler(args)
    observation = client.env_create(args.token)
    action = np.zeros(ACTION_SHAPE, dtype=np.float32)
    observation = observation_handler(observation, action)

    submitted = False
    while not submitted:
        print(episode_metrics["reward"])
        action = act_fn(observation)
        observation, reward, done, _ = client.env_step(
            action_handler(action).tolist())
        episode_metrics["reward"] += reward
        episode_metrics["step"] += 1

        if done:
            all_episode_metrics.append(episode_metrics)
            episode_metrics = {
                "reward": 0.0,
                "step": 0,
            }
            observation_handler = create_observation_handler(args)
            action_handler = create_action_handler(args)
            observation = client.env_reset()  # fixed: was env_create(); later episodes are started with env_reset()
            if not observation:
                submitted = True
                break
            action = np.zeros(ACTION_SHAPE, dtype=np.float32)
            observation = observation_handler(observation, action)
        else:
            observation = observation_handler(observation, action)

    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())
    if query_yes_no("Submit?"):
        client.submit()
class NIPS(object):
    def __init__(self, visualize=False, token=None, max_obstacles=3):
        logger.info("max_obstacles={}".format(max_obstacles))
        if token is None:
            self.remote_env = False
            self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
        else:
            self.remote_env = True
            self.local_env = RunEnv(visualize=False, max_obstacles=max_obstacles)
            self.token = token
            self.env = Client(GRADER_URL)
            self.env_created = False

    @property
    def observation_space(self):
        if self.remote_env:
            # because Client() has no observation_space
            return self.local_env.observation_space
        else:
            return self.env.observation_space

    @property
    def action_space(self):
        if self.remote_env:
            # because Client() has no action_space
            return self.local_env.action_space
        else:
            return self.env.action_space

    def reset(self):
        if self.remote_env:
            if not self.env_created:
                ob = self.env.env_create(self.token)
                self.env_created = True
            else:
                ob = self.env.env_reset()
        else:
            ob = self.env.reset(difficulty=2)
        return ob

    def step(self, action):
        if self.remote_env:
            ob, reward, done, info = self.env.env_step(action.tolist(), True)
        else:
            ob, reward, done, info = self.env.step(action)
        return ob, reward, done, info

    def close(self):
        if self.remote_env:
            self.env.submit()
        else:
            self.env.close()
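The NIPS wrapper hides the env_create/env_reset bookkeeping behind the usual reset/step/close interface, so a submission reduces to an ordinary evaluation loop. A sketch for the remote case, where MY_TOKEN and agent are assumptions:

env = NIPS(visualize=False, token=MY_TOKEN)
ob = env.reset()
while ob:  # remote reset() is falsy after the grader's last episode
    ob, reward, done, info = env.step(agent.act(ob))  # hypothetical agent
    if done:
        ob = env.reset()
env.close()  # submits when remote, closes the simulator when local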
class WrapperClient():
    def __init__(self, remote_base):
        self.client = Client(remote_base)
        self.ob_0 = np.zeros(41)  # fixed: `np.array(41)` creates a 0-d scalar, not a 41-dim buffer
        self.ob_1 = np.zeros(14)
        # self.ob_2 = np.zeros(41)

    def env_create(self, token):
        # fixed: a hard-coded token string ignored the `token` argument
        self.ob_0 = self.preprocess(np.array(self.client.env_create(token)))
        # return np.concatenate((self.ob_0,self.ob_1,self.ob_2),axis=0)
        return np.concatenate((self.ob_0, self.ob_1), axis=0)

    def env_reset(self):
        ob = self.client.env_reset()
        if ob is None:
            return None
        self.ob_0 = self.preprocess(np.array(ob))
        self.ob_0[1] = 0
        self.ob_1 = np.zeros(14)
        # self.ob_2 = np.zeros(41)
        # return np.concatenate((self.ob_0,self.ob_1,self.ob_2),axis=0)
        return np.concatenate((self.ob_0, self.ob_1), axis=0)

    def env_step(self, action):
        res = self.client.env_step(action)
        ob_0_post = self.ob_0
        # ob_1_post = self.ob_1
        # ob_2_post = self.ob_2
        self.ob_0 = self.preprocess(np.array(res[0]))
        self.ob_0[1] = 0
        # finite-difference velocities of body-part positions (dt = 0.01 s)
        self.ob_1 = (self.ob_0[22:36] - ob_0_post[22:36]) / 0.01
        # self.ob_2 = self.ob_1 - ob_1_post
        # fixed: a bare `return np.concatenate(...)` dropped reward/done/info
        # and made the following `return res` unreachable
        res[0] = np.concatenate((self.ob_0, self.ob_1), axis=0)
        return res

    def submit(self):
        self.client.submit()

    def preprocess(self, v):
        # express positions relative to the pelvis coordinates
        n = [1, 18, 22, 24, 26, 28, 30, 32, 34]
        m = [19, 23, 25, 27, 29, 31, 33, 35]
        for i in n:
            v[i] = v[i] - v[1]
        for i in m:
            v[i] = v[i] - v[2]
        v[20] = v[20] - v[4]
        v[21] = v[21] - v[5]
        return v
def submit():
    from osim.http.client import Client
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = "01342e360022c2def5c2cc04c5843381"
    client = Client(remote_base)  # fixed: naming the instance `Client` shadowed the class
    observation = client.env_create(env_id="ProstheticsEnv", token=crowdai_token)
    while True:
        k = np.reshape(np.array(observation), newshape=(-1, len(observation)))
        ac_ind = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
        ac_ind = np.reshape(ac_ind, newshape=(ac_ind.shape[1]))
        action = bins[ac_ind]
        [observation, reward, done, info] = client.env_step(action, True)
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()
def submit(args):
    print('start submitting')

    remote_base = 'http://grader.crowdai.org:1733'
    client = Client(remote_base)

    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)

    state = client.env_create(TOKEN)
    fg = FeatureGenerator()
    state = fg.gen(state)

    step = 0
    ep_reward = 0
    while True:
        print('selecting action ...', end=' ')
        action = ddpg.select_action(list(state))
        print('client.env_step ...')
        next_state, reward, done, info = client.env_step(action.tolist())
        next_state = fg.gen(next_state)
        print('step: {0:03d}, ep_reward: {1:02.08f}'.format(step, ep_reward))
        state = next_state
        ep_reward += reward
        step += 1
        if done:
            print('done')
            state = client.env_reset()
            if not state:
                break
            step = 0
            ep_reward = 0
            fg = FeatureGenerator()
            state = fg.gen(state)
    client.submit()
def submit(self):
    remote_base = 'http://grader.crowdai.org:1729'
    env = RunEnv(visualize=self.visualize)
    client = Client(remote_base)

    # Create environment
    observation = client.env_create(self.submit_token)

    # Run a single step
    #
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    while True:
        [observation, reward, done, info] = client.env_step(self.agent.forward(observation))
        if done:
            observation = client.env_reset()
            if not observation:
                break

    client.submit()
class RemoteSubmit(object):
    def __init__(self, token, agent_type):
        self.token = token
        self.remote_base = "http://grader.crowdai.org:1729"
        self.client = Client(self.remote_base)

        # TODO:: Add agent selector
        if agent_type == 'random':
            self.agent = RandomAgent()
        elif agent_type == 'fixed-action':
            self.agent = FixedActionAgent()
        elif agent_type == 'a3c':
            self.agent = A3CAgent(num_envs=2, num_steps=50, max_frames=1000)
        else:
            status = {
                'status': 'ERROR',
                'error_msg': 'Unsupported agent type'
            }
            raise Exception(status)

    def run(self):
        try:
            status = self.agent.run()
            observation = self.client.env_create(self.token, env_id="ProstheticsEnv")
            while True:
                action = self.agent.get_action(observation)
                [observation, reward, done, info] = self.client.env_step(action, False)
                if done:
                    observation = self.client.env_reset()
                    if not observation:
                        break
            self.client.submit()
        except Exception as e:
            status = {'status': 'ERROR', 'error_msg': e}
            raise Exception(status)
def main():
    # Settings
    remote_base = 'http://grader.crowdai.org'

    # Command line parameters
    parser = argparse.ArgumentParser(
        description='Submit the result to crowdAI')
    parser.add_argument("hdf")
    parser.add_argument('--token', dest='token', action='store', required=True)
    args = parser.parse_args()

    hdf = h5py.File(args.hdf, 'r')
    env = GaitEnv(visualize=False)
    agent = cPickle.loads(hdf['agent_snapshots']['0995'].value)
    agent.stochastic = False

    client = Client(remote_base)

    # Create environment
    observation = client.env_create(args.token)

    total_reward = 0
    # Run a single step
    for i in range(501):
        ob = agent.obfilt(observation)
        a, _info = agent.act(ob)
        [observation, reward, done, info] = client.env_step(a.tolist(), True)
        print(i, reward, done)  # fixed: Python 2 print statement
        total_reward += reward
        if done:
            break

    print('TOTAL REWARD: ', total_reward)
    input('press ENTER to submit')  # fixed: Python 2 raw_input()
    client.submit()
observation = env.reset(seed=args.seed)

# CGP controller
library = build_funcLib()
ind = CGP.load_from_file(cgp_id, library)
l2meval = L2MEvaluator(1e8, 1)

i = 0
j = 0
r_total = 0.0
while True:
    inputs = l2meval.get_inputs(observation)
    outputs = l2meval.scale_outputs(ind.run(inputs))
    if args.live:
        [observation, reward, done, info] = client.env_step(outputs.tolist())
    else:
        [observation, reward, done, info] = env.step(outputs)
    r_total += reward
    print('%d %d %f %f' % (i, j, reward, r_total))
    i += 1
    if done:
        if args.live:
            i = 0
            j += 1
            r_total = 0
            observation = client.env_reset()
            if not observation:
                break
        else:
            break
def upload(frameskip=1):
    from osim.http.client import Client
    apikey = open('apikey.txt').read().strip('\n')
    print('Using apikey:', apikey)

    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = apikey

    print('connecting...')
    client = Client(remote_base)
    observation_d = client.env_create(crowdai_token, env_id="ProstheticsEnv")
    # observation = process_obs_dict(observation_d)
    print('environment created! running...')

    # obs_collect = []
    # a_collect = []
    stepno = 0
    epino = 0
    total_reward = 0

    while True:
        # a = AGENT OUTPUT
        observation = process_obs_dict(observation_d)
        a, q = agent.act(observation)
        a = [float(i) for i in list(a)]
        # obs_collect.append(observation)
        # a_collect.append(a)
        for _ in range(frameskip):
            [observation_d, reward, done, info] = client.env_step(a, True)
            stepno += 1
            total_reward += reward
            print('step', stepno, 'total reward', total_reward)
            if done:
                '''
                print('')
                print('saving...')
                print('')
                with open('upload_saves/upload_a_collect_' + str(epino) + '.p', 'wb') as f:
                    pickle.dump(a_collect, f)
                with open('upload_saves/upload_obs_collect_' + str(epino) + '.p', 'wb') as f:
                    pickle.dump(obs_collect, f)
                '''
                observation_d = client.env_reset()
                print('>> episode', epino, ' Done after', stepno, 'got reward:', total_reward)
                print('')
                total_reward = 0
                stepno = 0
                epino += 1
                break
        if not observation_d:
            break

    print('Done! Submitting...')
    client.submit()
client = Client(remote_base)

with open(modeldir + logfile + "/best_trajectory.pkl", "rb") as f:
    trajectory = pickle.load(f)["trajectory"]
done = False
timestep = 0
observation = client.env_create(crowdai_token, env_id='ProstheticsEnv')
episode_reward = 0
while True:
    # replay the saved best trajectory, cycling through its first 100 actions
    action = trajectory[timestep % 100]["action"]
    [observation, reward, done, info] = client.env_step(action.detach().numpy().tolist(), True)
    episode_reward += reward
    timestep += 1
    print(episode_reward, timestep)
    if done:
        observation = client.env_reset()
        print("Reset")
        if not observation:
            break
client.submit()
def my_controller(observation, ctr):
    return [float(x) for x in list(arr_list[min(ctr, max_action_steps - 1)])]

ep_no = 2
arr_list = arrs[ep_no]
ep_no_new = 1
arr_list_new = arrs_new[ep_no_new]
arr_list = arr_list[0:180]
arr_list = arr_list + arr_list_new
max_action_steps = len(arr_list)

ctr = 0
while True:
    [observation, reward, done, info] = client.env_step(my_controller(observation, ctr), True)
    ctr += 1
    if done:
        observation = client.env_reset()
        ctr = 0
        if not observation:
            break
client.submit()
# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
def my_controller(observation, theta, n, mean, mean_diff, var):
    obs_std = np.sqrt(var)
    state = (observation - mean) / obs_std
    return theta.dot(state)

theta = np.genfromtxt('policy.out', delimiter=' ', dtype=np.float32)
print("Loading from policy matrix.")
n = np.genfromtxt('n.out', delimiter=' ', dtype=np.float32)
print("Loading from n matrix.")
mean = np.genfromtxt('mean.out', delimiter=' ', dtype=np.float32)
print("Loading from mean matrix.")
mean_diff = np.genfromtxt('mean_diff.out', delimiter=' ', dtype=np.float32)
print("Loading from mean diff matrix.")
var = np.genfromtxt('var.out', delimiter=' ', dtype=np.float32)
print("Loading from variance matrix.")

tot_reward = 0
while True:
    action = my_controller(observation, theta, n, mean, mean_diff, var)
    # .tolist() so the numpy action serializes as JSON for the HTTP client
    [observation, reward, done, info] = client.env_step(action.tolist(), True)
    tot_reward += reward
    print(tot_reward)
    if done:
        observation = client.env_reset()
        if not observation:
            break
print(tot_reward)
client.submit()
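The controller above is a linear policy acting on mean/variance-normalized observations, action = theta.dot((observation - mean) / sqrt(var)), the form produced by ARS-style training. A quick shape check, with hypothetical dimensions (a 41-d observation and 19 muscles):

import numpy as np

obs_dim, act_dim = 41, 19                        # assumed dimensions, for illustration only
theta = np.zeros((act_dim, obs_dim))             # policy matrix, one row per muscle
mean, var = np.zeros(obs_dim), np.ones(obs_dim)  # running observation statistics

s = np.random.randn(obs_dim)                     # a fake observation
state = (s - mean) / np.sqrt(var)                # normalize
action = theta.dot(state)                        # one excitation per muscle
assert action.shape == (act_dim,)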
def main():
    args = parse_args()
    logger.configure()

    gamma = 0.99
    tau = 0.01
    normalize_returns = False
    normalize_observations = True
    batch_size = 64
    action_noise = None
    stddev = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                         desired_action_stddev=float(stddev))
    critic_l2_reg = 1e-2
    actor_lr = 1e-4
    critic_lr = 1e-3
    popart = False
    clip_norm = None
    reward_scale = 1.

    env = prosthetics_env.Wrapper(osim_env.ProstheticsEnv(visualize=False),
                                  frameskip=4,
                                  reward_shaping=True,
                                  reward_shaping_x=1,
                                  feature_embellishment=True,
                                  relative_x_pos=True,
                                  relative_z_pos=True)

    top_model_dir = 'top-models/'

    # create tf sessions and graphs
    sess_list = []
    graph_list = []
    for i in range(len(args.model_files)):
        graph_list.append(tf.Graph())
        sess_list.append(tf.Session(graph=graph_list[i]))

    ddpg_agents = []
    for i in range(len(args.model_files)):
        model_name = args.model_files[i]
        sess = sess_list[i]
        graph = graph_list[i]
        l_size = args.layer_sizes[i]
        with sess.as_default():
            # with U.make_session(num_cpu=1, graph=g) as sess:
            with graph.as_default():
                # tf.global_variables_initializer()
                # restore agents from model files and store in ddpg_agents
                print("Restoring from..." + model_name)

                # Configure components.
                memory = Memory(limit=int(1e6),
                                action_shape=env.action_space.shape,
                                observation_shape=env.observation_space.shape)
                critic = Critic(layer_norm=True, activation='relu',
                                layer_sizes=[l_size, l_size])
                actor = Actor(env.action_space.shape[-1], layer_norm=True,
                              activation='relu', layer_sizes=[l_size, l_size])

                agent = DDPG(actor, critic, memory,
                             env.observation_space.shape, env.action_space.shape,
                             gamma=gamma, tau=tau,
                             normalize_returns=normalize_returns,
                             normalize_observations=normalize_observations,
                             batch_size=batch_size,
                             action_noise=action_noise, param_noise=param_noise,
                             critic_l2_reg=critic_l2_reg,
                             actor_lr=actor_lr, critic_lr=critic_lr,
                             enable_popart=popart, clip_norm=clip_norm,
                             reward_scale=reward_scale)

                # restore adam state and param noise
                restore_model_path = top_model_dir + model_name
                saver = tf.train.Saver(max_to_keep=500)

                # restore network weights
                saver.restore(sess, restore_model_path)

                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']

                # initialize and prepare agent session.
                agent.initialize(sess)
                # sess.graph.finalize()
                agent.reset()
                ddpg_agents.append(agent)

    agent = BlendedAgent(ddpg_agents, sess_list, graph_list)

    if args.evaluation:
        # setup eval env
        eval_env = prosthetics_env.EvaluationWrapper(osim_env.ProstheticsEnv(visualize=False),
                                                     frameskip=4,
                                                     reward_shaping=True,
                                                     reward_shaping_x=1,
                                                     feature_embellishment=True,
                                                     relative_x_pos=True,
                                                     relative_z_pos=True)
        eval_env.change_model(model=('3D').upper(), prosthetic=True, difficulty=0, seed=0)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        nb_eval_steps = 1000
        # reward, mean_q, final_steps = evaluate_one_episode(eval_env, ddpg_agents, sess_list, graph_list,
        #                                                    nb_eval_steps=nb_eval_steps,
        #                                                    render=False)
        reward, mean_q, final_steps = evaluate_one_episode(eval_env, agent, nb_eval_steps, render=False)
        print("Reward: " + str(reward))
        print("Mean Q: " + str(mean_q))
        print("Final num steps: " + str(final_steps))

    # Submit to crowdai competition. What a hack. :)
    # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
    crowdai_submit_count = 0
    if args.crowdai_submit:
        remote_base = "http://grader.crowdai.org:1729"
        crowdai_client = Client(remote_base)
        eval_obs_dict = crowdai_client.env_create(args.crowdai_token, env_id="ProstheticsEnv")
        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
            eval_obs_dict,
            reward_shaping=True,
            reward_shaping_x=1.,
            feature_embellishment=True,
            relative_x_pos=True,
            relative_z_pos=True)
        while True:
            action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
            submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
            clipped_submit_action = np.clip(submit_action, 0., 1.)
            actions_equal = clipped_submit_action == submit_action
            if not np.all(actions_equal):
                logger.debug("crowdai_submit_count:", crowdai_submit_count)
                logger.debug("  openai-action:", action)
                logger.debug("  submit-action:", submit_action)
            crowdai_submit_count += 1
            [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True)
            # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
            eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                eval_obs_dict,
                reward_shaping=True,
                reward_shaping_x=1.,
                feature_embellishment=True,
                relative_x_pos=True,
                relative_z_pos=True)
            if done:
                logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                eval_obs_dict = crowdai_client.env_reset()
                if not eval_obs_dict:
                    break
                logger.debug("done: eval_obs_dict exists after reset")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=True,
                    reward_shaping_x=1.,
                    feature_embellishment=True,
                    relative_x_pos=True,
                    relative_z_pos=True)
        crowdai_client.submit()

    for i in range(len(sess_list)):
        sess_list[i].close()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate, submit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    env, obs_dim, act_dim = init_osim(animate)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    if mpi_util.rank == 0:
        # aigym_path = os.path.join('/tmp', env_name, now)
        # env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    episode = 0
    checkpoint = Checkpoint("saves", now)
    # restore from checkpoint?
    if restore_path:
        (policy, val_func, scaler, episode, obs_dim, act_dim,
         kl_targ) = checkpoint.restore(restore_path)
    else:
        policy = Policy(obs_dim, act_dim, kl_targ)
        val_func = NNValueFunction(obs_dim)
        scaler = Scaler(obs_dim)

        if mpi_util.rank == 0:
            # run a few episodes (on node 0) of untrained policy to initialize scaler:
            trajectories = run_policy(env, policy, scaler, episodes=5)
            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)  # update running statistics for scaling observations

    # broadcast policy weights, scaler, val_func
    (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    if mpi_util.rank == 0:
        checkpoint.save(policy, val_func, scaler, episode)

    if animate:
        observes, actions, rewards, unscaled_obs = run_episode(env, policy, scaler, animate=animate)
        exit(0)

    if submit:
        # Settings
        # remote_base = 'http://grader.crowdai.org:1729'
        remote_base = 'http://grader.crowdai.org:1730'
        token = 'a83412a94593cae3a491f3ee28ff44e1'
        client = Client(remote_base)

        # Create environment
        observation = client.env_create(token)
        step = 0.0
        observes, actions, rewards, unscaled_obs = [], [], [], []
        scale, offset = scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature

        # Run a single step
        #
        # The grader runs 3 simulations of at most 1000 steps each.
        # We stop after the last one
        while True:
            obs = np.array(observation).astype(np.float32).reshape((1, -1))
            print("OBSERVATION TYPE:", type(obs), obs.shape)
            print(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale  # center and scale observations
            observes.append(obs)

            action = policy.sample(obs).astype(np.float32).reshape((-1, 1))
            print("ACTION TYPE:", type(action), action.shape)
            print(action)
            actions.append(action)
            [observation, reward, done, info] = client.env_step(action.tolist())
            print("step:", step, "reward:", reward)

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            step += 1e-3  # increment time step feature

            if done:
                print("================================== RESTARTING =================================")
                observation = client.env_reset()
                step = 0.0
                observes, actions, rewards, unscaled_obs = [], [], [], []
                scale, offset = scaler.get()
                scale[-1] = 1.0  # don't scale time step feature
                offset[-1] = 0.0  # don't offset time step feature
                if not observation:
                    break

        client.submit()
        exit(0)

    ######
    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:", mpi_util.nworkers)
        exit(1)

    batch = 0
    while episode < num_episodes:
        if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0:
            checkpoint.save(policy, val_func, scaler, episode)
        batch = batch + 1

        trajectories = run_policy(env, policy, scaler, episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])
            episode += len(trajectories)

            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward': np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps': np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)

            policy.update(observes, actions, advantages, logger)  # update policy
            val_func.fit(observes, disc_sum_rew, logger)  # update value function

            unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
            scaler.update(unscaled)  # update running statistics for scaling observations
            logger.write(display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    if mpi_util.rank == 0:
        logger.close()
        policy.close_sess()
    if mpi_util.rank == 0:
        val_func.close_sess()
# If TEST and TOKEN, submit to crowdAI
if not args.train and args.token:
    agent.load_weights(args.model)
    # Settings
    remote_base = 'http://grader.crowdai.org:1729'
    client = Client(remote_base)

    # Create environment
    observation = client.env_create(args.token)

    # Run a single step
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    while True:
        v = np.array(observation).reshape((env.observation_space.shape[0]))
        action = agent.forward(v)
        # fixed: a debug print also called client.env_step(), which advanced
        # the environment twice per loop iteration
        [observation, reward, done, info] = client.env_step(action.tolist())
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()

# If TEST and no TOKEN, run some test experiments
if not args.train and not args.token:
    agent.load_weights(args.model)
    # Finally, evaluate our algorithm for 1 episode.
    agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=500)
# IMPLEMENTATION OF YOUR CONTROLLER
# my_controller = ... (for example the one trained in keras_rl)
def my_controller(observation, time_step):
    bin_index = (0, )
    genome = map_elites.container.grid[bin_index]["genome"]
    action = []
    for muscle_index in range(19):
        action.append(
            genome.control_function(muscle_index=muscle_index, time_step=time_step)[0])
    return action

i = 0
total_reward = 0
time_step = 0
while True:
    time_step += 1
    [observation, reward, done, info] = client.env_step(my_controller(observation, time_step), True)
    total_reward += reward
    print(i, total_reward)
    i += 1
    if done:
        observation = client.env_reset()
        if not observation:
            break
client.submit()
from osim.http.client import Client
from osim.env import ProstheticsEnv
import numpy as np
import argparse

# Settings
remote_base = 'http://grader.crowdai.org:1729'

# Command line parameters
parser = argparse.ArgumentParser(description='Submit the result to crowdAI')
parser.add_argument('--token', dest='token', action='store', required=True)
args = parser.parse_args()

client = Client(remote_base)

# Create environment
observation = client.env_create(args.token, env_id="ProstheticsEnv")
env = ProstheticsEnv()

# Run a single step
# The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
while True:
    print(observation)
    [observation, reward, done, info] = client.env_step(env.action_space.sample().tolist())
    if done:
        observation = client.env_reset()
        if not observation:
            break
client.submit()
import numpy as np
import argparse

# added: these two imports are required below but were missing
from osim.env import RunEnv
from osim.http.client import Client

# Settings
remote_base = 'http://grader.crowdai.org:1729'

# Command line parameters
parser = argparse.ArgumentParser(description='Submit the result to crowdAI')
parser.add_argument('--token', dest='token', action='store', required=True)
args = parser.parse_args()

env = RunEnv(visualize=False)
client = Client(remote_base)

# Create environment
observation = client.env_create(args.token)

# Run a single step
#
# The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
while True:
    v = np.array(observation).reshape((-1, 1, env.observation_space.shape[0]))
    [observation, reward, done, info] = client.env_step(env.action_space.sample().tolist())
    print(observation)
    if done:
        observation = client.env_reset()
        if not observation:
            break
client.submit()
import numpy as np
import argparse

# added: these two imports are required below but were missing
from osim.env import ProstheticsEnv
from osim.http.client import Client

# Settings
# remote_base = 'http://grader.crowdai.org:1729'  # Submission to Round-1
remote_base = 'http://grader.crowdai.org:1730'  # Submission to Round-2

# Command line parameters
parser = argparse.ArgumentParser(description='Submit the result to crowdAI')
parser.add_argument('--token', dest='token', action='store', required=True)
args = parser.parse_args()

client = Client(remote_base)

# Create environment
observation = client.env_create(args.token, env_id="ProstheticsEnv")
env = ProstheticsEnv()

# Run a single step
# The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
while True:
    print(observation)
    [observation, reward, done, info] = client.env_step(env.action_space.sample().tolist())
    if done:
        observation = client.env_reset()
        if not observation:
            break
client.submit()
from osim.env import ProstheticsEnv
from osim.http.client import Client  # added: required below but was missing

remote_base = "http://grader.crowdai.org:1729"
crowdai_token = "a6d6c970d3883bee5730708739550518"
client = Client(remote_base)
observation = client.env_create(crowdai_token, env_id="ProstheticsEnv")
# env = ProstheticsEnv(visualize=True)
# i = 0

def exit(default=0):
    import sys
    sys.exit(default)

def get_default():
    return [0.5488135, 0.71518934, 0.60276335, 0.5448832, 0.4236548,
            0.6458941, 0.4375872, 0.891773, 0.96366274, 0.3834415,
            0.79172504, 0.5288949, 0.56804454, 0.92559665, 0.07103606,
            0.0871293, 0.0202184, 0.83261985, 0.77815676]

def my_controller(observation):
    return get_default()

while True:
    # [observation, reward, done, info] = client.env_step(env.action_space.sample().tolist())
    [observation, reward, done, info] = client.env_step(my_controller(observation), True)
    if done:
        print("done")
        observation = client.env_reset()
        if not observation:
            print("break")
            break
client.submit()
agent.save_weights(args.model, overwrite=True)

# If TEST and TOKEN, submit to crowdAI
if not args.train and args.token:
    agent.load_weights(args.model)
    # Settings
    remote_base = 'http://grader.crowdai.org:1729'
    client = Client(remote_base)

    # Create environment
    observation = client.env_create(args.token)

    # Run a single step
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    while True:
        v = np.array(observation).reshape((env.observation_space.shape[0]))
        action = agent.forward(v)
        [observation, reward, done, info] = client.env_step(action.tolist())
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()

# If TEST and no TOKEN, run some test experiments
if not args.train and not args.token:
    agent.load_weights(args.model)
    # Finally, evaluate our algorithm for 1 episode.
    agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=500)
# Initial hidden state at start of episode
hidden = agent.local_evaluator.policy_map['default'].get_initial_state()

# Evaluation loop
while True:
    # NOTE TODO: reduce action space is hard-coded in train.py!
    if use_lstm:
        action, hidden, logits_dict = agent.compute_action(observation=state, state=hidden)
    else:
        action = agent.compute_action(state)
    action = dummy_env.expand_action(action)  # get back original action space, this is also a list now

    # Repeat same action downsample_factor number of times
    for _ in range(downsample_factor):
        state_desc, reward, done, info = client.env_step(action)  # , True)
        if done:
            break

    state = dummy_env.process_state_desc(state_desc)  # "next state"

    if done:
        state_desc = client.env_reset()
        if not state_desc:
            break
        state = dummy_env.process_state_desc(state_desc)
        if use_lstm:
            # Initial hidden state at start of episode
            hidden = agent.local_evaluator.policy_map['default'].get_initial_state()
#     if not observation:
#         break
# client.submit()

# If TEST and TOKEN, submit to crowdAI
if args.token:
    agent.load_weights(args.model)
    remote_base = 'http://grader.crowdai.org:1729'
    client = Client(remote_base)

    # Create environment
    observation = client.env_create(args.token)

    # Run a single step
    # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
    total_reward = 0  # added: the loop accumulates into total_reward, but it was never initialized
    while True:
        v = np.array(observation).reshape((env.observation_space.shape[0]))
        action = agent.forward(v)
        [observation, reward, done, info] = client.env_step(action.tolist())
        observation = process_observation(observation)
        total_reward += reward
        if done:
            observation = client.env_reset()
            if not observation:
                break
    client.submit()

# Finally, evaluate our algorithm for 1 episode.
#
nb_actions = env.action_space.shape[0]

# Load the actor
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(32))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('sigmoid'))
actor.load_weights(args.model)

client = Client(remote_base)

# Create environment
observation = client.env_create(args.token)

# Run a single step
for i in range(501):
    v = np.array(observation).reshape((-1, 1, env.observation_space.shape[0]))
    # fixed: env_step() takes the action (and an optional render flag), not the token
    [observation, reward, done, info] = client.env_step(actor.predict(v)[0].tolist(), True)
    if done:
        break

# fixed: submit() takes no arguments; the token is supplied to env_create()
client.submit()