# Requires (module level): import numpy as np; from ray.rllib.agents import ars
def evaluate(self, evaluation_config, agent_config, env_config, random=False):
    """Evaluate a restored ARS agent on the jumper env, optionally sweeping
    the starting leg angle across uniform domain buckets."""
    config = ars.DEFAULT_CONFIG.copy()
    config.update(agent_config)
    config["num_workers"] = 1
    config["noise_size"] = 250000
    trained_agent = ars.ARSTrainer(config, env="jumper")
    trained_agent.restore(evaluation_config["evaluation_file"])

    domain_buckets = evaluation_config["domain_buckets"]
    num_episodes = evaluation_config["num_episodes"]
    print(domain_buckets)

    starting_leg_angles = None
    if domain_buckets is not None:
        # num_episodes counts episodes *per bucket*: bucket i draws its
        # starting angles uniformly from [min + step*i, min + step*(i+1)).
        num_buckets = int((domain_buckets["max"] - domain_buckets["min"]) / domain_buckets["step"])
        starting_leg_angles = []
        for i in range(num_buckets):
            low = domain_buckets["min"] + domain_buckets["step"] * i
            high = domain_buckets["min"] + domain_buckets["step"] * (i + 1)
            starting_leg_angles.append(
                np.random.uniform(low=low, high=high, size=(evaluation_config["num_episodes"],)))
            print(low, high)
        num_episodes *= num_buckets

    env = create_environment(env_config)
    cumulative_reward = 0
    history = []
    print(starting_leg_angles)
    for i in range(num_episodes):
        # Guard the bucket lookup: without buckets there are no sampled
        # angles, and indexing into starting_leg_angles would raise.
        if domain_buckets is not None:
            bucket_index = i // evaluation_config["num_episodes"]
            episode_bucket_index = i % evaluation_config["num_episodes"]
            starting_leg_angle = [starting_leg_angles[bucket_index][episode_bucket_index], 0]
            env.setStartingLegAngle(starting_leg_angle)
        reward = self.run_episode(env, trained_agent, random=random)
        min_history_dict = {"reward": reward}
        if domain_buckets is not None:
            min_history_dict.update({
                "bucket_index": bucket_index,
                "episode_bucket_index": episode_bucket_index,
                "starting_leg_angle": starting_leg_angle,
            })
        history.append(min_history_dict)
        cumulative_reward += reward
    self.printer.history(history)
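# evaluate() delegates the per-episode rollout to self.run_episode(), which is
# not shown in this section. A minimal sketch of what it is assumed to do
# (greedy policy rollout, or uniformly random actions when random=True),
# mirroring the rollout loop in the standalone script further below:
def run_episode(self, env, trained_agent, random=False):
    """Roll out one episode and return the cumulative reward."""
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        if random:
            action = env.action_space.sample()  # random baseline, ignores the policy
        else:
            action = trained_agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward
    return cumulative_reward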
def test_ars_compilation(self):
    """Test whether an ARSTrainer can be built on all frameworks."""
    ray.init(num_cpus=2, local_mode=True)
    config = ars.DEFAULT_CONFIG.copy()
    # Keep it simple.
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = None
    num_iterations = 2

    for _ in framework_iterator(config, ("torch", "tf")):
        plain_config = config.copy()
        trainer = ars.ARSTrainer(config=plain_config, env="CartPole-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_action(trainer)
def test_ars_compilation(self):
    """Test whether an ARSTrainer can be built on all frameworks."""
    config = ars.DEFAULT_CONFIG.copy()
    # Keep it simple.
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = None
    config["noise_size"] = 2500000
    # Test eval workers ("normal" Trainer eval WorkerSet, unusual for ARS).
    config["evaluation_interval"] = 1
    config["evaluation_num_workers"] = 1
    num_iterations = 2

    for _ in framework_iterator(config):
        plain_config = config.copy()
        trainer = ars.ARSTrainer(config=plain_config, env="CartPole-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_single_action(trainer)
        trainer.stop()
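# These tests follow RLlib's usual test-file layout; a standalone entry point
# in that style (the pytest invocation is an assumption, not part of the
# snippets above) would be:
if __name__ == "__main__":
    import sys

    import pytest

    sys.exit(pytest.main(["-v", __file__]))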
from ray.rllib.agents import ars

config = ars.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["noise_stdev"] = 0.02
config["num_rollouts"] = 100
config["rollouts_used"] = 50
config["sgd_stepsize"] = 0.03
config["noise_size"] = 250000000
config["eval_prob"] = 0.5
# config["env_config"] = {"observation": 'end_points', "control_type": 'current_length_mod'}

restore_model_path = "evaluate/checkpoint_180/checkpoint-180"
# Alternative: resume training from the checkpoint instead of evaluating it:
# tune.run("ARS", restore=restore_model_path, stop={"timesteps_total": 100000}, config={"env": "jumper"})
test_agent = ars.ARSTrainer(config, env="jumper")
test_agent.restore(restore_model_path)

env = create_environment("a")  # placeholder env_config string
for i in range(50):
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        action = test_agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward
    print("Total Reward: {:}".format(cumulative_reward))
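# Not in the original script: a hedged sketch that aggregates the evaluation
# episodes into summary statistics instead of only printing each total. It
# reuses env and test_agent from the snippet above.
import numpy as np

episode_rewards = []
for _ in range(50):
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        state, reward, done, _ = env.step(test_agent.compute_action(state))
        cumulative_reward += reward
    episode_rewards.append(cumulative_reward)

print("mean: {:.2f}  std: {:.2f}  min: {:.2f}  max: {:.2f}".format(
    np.mean(episode_rewards), np.std(episode_rewards),
    np.min(episode_rewards), np.max(episode_rewards)))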
import json
import os
import pickle
import re
import time

import gym
import numpy as np
import ray
from ray.rllib.agents import a3c, ars, ddpg, es, pg, ppo, sac
from ray.rllib.agents.ddpg import td3
from ray.rllib.models import ModelCatalog


def render(checkpoint, home_path):
    """Render pybullet and mujoco environments from a saved checkpoint.

    The algorithm and environment names are parsed from the experiment
    directory name, which is expected to look like "<ALG>_<ENV>_...".
    """
    alg = re.match('.+?(?=_)', os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)", os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = home_path + "checkpoint_" + str(checkpoint) + "/checkpoint-" + str(checkpoint)
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))

    ray.shutdown()
    import pybullet_envs  # noqa: F401 -- registers the Bullet envs with gym
    ray.init()
    # RBFModel, MLP, and Linear are this project's custom models.
    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    # PPO restores from the pickled config; the other trainers use the JSON one.
    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    elif alg == "SAC":
        trainer = sac.SACTrainer(config)
    elif alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    elif alg == "PG":
        trainer = pg.PGTrainer(config)
    elif alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    elif alg == "TD3":
        trainer = td3.TD3Trainer(config)
    elif alg == "ES":
        trainer = es.ESTrainer(config)
    elif alg == "ARS":
        trainer = ars.ARSTrainer(config)
    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    obs_hist = []
    reward_hist = []
    for t in range(10000):
        # For some algorithms the action distribution parameters can also be
        # fetched, e.g.:
        #   sampled_actions, _, out_dict = trainer.compute_action(obs.flatten(), full_fetch=True)
        #   mean_actions = out_dict['behaviour_logits'][:17]  # slice depends on the env
        # Reminder: the behaviour logits are the mean and logstd (not the log
        # mean, despite the name "logit").
        actions = trainer.compute_action(obs.flatten())
        obs, reward, done, _ = env.step(np.asarray(actions))
        env.render(mode='human')
        time.sleep(0.01)
        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()
    # Return the histories so they can be inspected or plotted after the rollout.
    return action_hist, obs_hist, reward_hist
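# The commented-out plotting in the original render() hints at inspecting the
# rollout histories. A hedged sketch of that post-processing (matplotlib is
# assumed to be installed; plot_histories is a hypothetical helper fed by the
# values render() now returns):
import matplotlib.pyplot as plt
import numpy as np


def plot_histories(action_hist, obs_hist, reward_hist):
    """Plot actions, observations, and cumulative reward over the rollout."""
    fig, axes = plt.subplots(3, 1, sharex=True)
    axes[0].plot(np.asarray(action_hist))
    axes[0].set_ylabel("action")
    axes[1].plot(np.asarray(obs_hist))
    axes[1].set_ylabel("observation")
    axes[2].plot(np.cumsum(np.asarray(reward_hist)))
    axes[2].set_ylabel("cumulative reward")
    axes[2].set_xlabel("timestep")
    plt.show()


# Example usage (checkpoint number and experiment path are placeholders):
# plot_histories(*render(180, "evaluate/ARS_jumper_.../"))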