# Requires (module level): import numpy as np; from ray.rllib.agents import ars
def evaluate(self, evaluation_config, agent_config, env_config, random=False):
    """Evaluate a restored ARS agent on the jumper env, optionally sweeping
    the starting leg angle across uniform domain buckets."""
    config = ars.DEFAULT_CONFIG.copy()
    config.update(agent_config)
    config["num_workers"] = 1
    config["noise_size"] = 250000
    trained_agent = ars.ARSTrainer(config, env="jumper")
    trained_agent.restore(evaluation_config["evaluation_file"])

    domain_buckets = evaluation_config["domain_buckets"]
    num_episodes = evaluation_config["num_episodes"]
    print(domain_buckets)

    starting_leg_angles = None
    if domain_buckets is not None:
        # num_episodes counts episodes *per bucket*: bucket i draws its
        # starting angles uniformly from [min + step*i, min + step*(i+1)).
        num_buckets = int((domain_buckets["max"] - domain_buckets["min"]) / domain_buckets["step"])
        starting_leg_angles = []
        for i in range(num_buckets):
            low = domain_buckets["min"] + domain_buckets["step"] * i
            high = domain_buckets["min"] + domain_buckets["step"] * (i + 1)
            starting_leg_angles.append(
                np.random.uniform(low=low, high=high, size=(evaluation_config["num_episodes"],)))
            print(low, high)
        num_episodes *= num_buckets

    env = create_environment(env_config)
    cumulative_reward = 0
    history = []
    print(starting_leg_angles)
    for i in range(num_episodes):
        # Guard the bucket lookup: without buckets there are no sampled
        # angles, and indexing into starting_leg_angles would raise.
        if domain_buckets is not None:
            bucket_index = i // evaluation_config["num_episodes"]
            episode_bucket_index = i % evaluation_config["num_episodes"]
            starting_leg_angle = [starting_leg_angles[bucket_index][episode_bucket_index], 0]
            env.setStartingLegAngle(starting_leg_angle)
        reward = self.run_episode(env, trained_agent, random=random)
        min_history_dict = {"reward": reward}
        if domain_buckets is not None:
            min_history_dict.update({
                "bucket_index": bucket_index,
                "episode_bucket_index": episode_bucket_index,
                "starting_leg_angle": starting_leg_angle,
            })
        history.append(min_history_dict)
        cumulative_reward += reward
    self.printer.history(history)
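# evaluate() delegates the per-episode rollout to self.run_episode(), which is
# not shown in this section. A minimal sketch of what it is assumed to do
# (greedy policy rollout, or uniformly random actions when random=True),
# mirroring the rollout loop in the standalone script further below:
def run_episode(self, env, trained_agent, random=False):
    """Roll out one episode and return the cumulative reward."""
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        if random:
            action = env.action_space.sample()  # random baseline, ignores the policy
        else:
            action = trained_agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward
    return cumulative_reward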
def test_ars_compilation(self):
    """Test whether an ARSTrainer can be built on all frameworks."""
    ray.init(num_cpus=2, local_mode=True)
    config = ars.DEFAULT_CONFIG.copy()
    # Keep it simple.
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = None
    num_iterations = 2

    for _ in framework_iterator(config, ("torch", "tf")):
        plain_config = config.copy()
        trainer = ars.ARSTrainer(config=plain_config, env="CartPole-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_action(trainer)
def test_ars_compilation(self):
    """Test whether an ARSTrainer can be built on all frameworks."""
    config = ars.DEFAULT_CONFIG.copy()
    # Keep it simple.
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = None
    config["noise_size"] = 2500000
    # Test eval workers ("normal" Trainer eval WorkerSet, unusual for ARS).
    config["evaluation_interval"] = 1
    config["evaluation_num_workers"] = 1
    num_iterations = 2

    for _ in framework_iterator(config):
        plain_config = config.copy()
        trainer = ars.ARSTrainer(config=plain_config, env="CartPole-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_single_action(trainer)
        trainer.stop()
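# These tests follow RLlib's usual test-file layout; a standalone entry point
# in that style (the pytest invocation is an assumption, not part of the
# snippets above) would be:
if __name__ == "__main__":
    import sys

    import pytest

    sys.exit(pytest.main(["-v", __file__]))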
from ray.rllib.agents import ars

config = ars.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["noise_stdev"] = 0.02
config["num_rollouts"] = 100
config["rollouts_used"] = 50
config["sgd_stepsize"] = 0.03
config["noise_size"] = 250000000
config["eval_prob"] = 0.5
# config["env_config"] = {"observation": 'end_points', "control_type": 'current_length_mod'}

restore_model_path = "evaluate/checkpoint_180/checkpoint-180"
# Alternative: resume training from the checkpoint instead of evaluating it:
# tune.run("ARS", restore=restore_model_path, stop={"timesteps_total": 100000}, config={"env": "jumper"})
test_agent = ars.ARSTrainer(config, env="jumper")
test_agent.restore(restore_model_path)

env = create_environment("a")  # placeholder env_config string
for i in range(50):
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        action = test_agent.compute_action(state)
        state, reward, done, _ = env.step(action)
        cumulative_reward += reward
    print("Total Reward: {:}".format(cumulative_reward))
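# Not in the original script: a hedged sketch that aggregates the evaluation
# episodes into summary statistics instead of only printing each total. It
# reuses env and test_agent from the snippet above.
import numpy as np

episode_rewards = []
for _ in range(50):
    state = env.reset()
    cumulative_reward = 0
    done = False
    while not done:
        state, reward, done, _ = env.step(test_agent.compute_action(state))
        cumulative_reward += reward
    episode_rewards.append(cumulative_reward)

print("mean: {:.2f}  std: {:.2f}  min: {:.2f}  max: {:.2f}".format(
    np.mean(episode_rewards), np.std(episode_rewards),
    np.min(episode_rewards), np.max(episode_rewards)))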
import json
import os
import pickle
import re
import time

import gym
import numpy as np
import ray
from ray.rllib.agents import a3c, ars, ddpg, es, pg, ppo, sac
from ray.rllib.agents.ddpg import td3
from ray.rllib.models import ModelCatalog


def render(checkpoint, home_path):
    """Render pybullet and mujoco environments from a saved checkpoint.

    The algorithm and environment names are parsed from the experiment
    directory name, which is expected to look like "<ALG>_<ENV>_...".
    """
    alg = re.match('.+?(?=_)', os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)", os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = home_path + "checkpoint_" + str(checkpoint) + "/checkpoint-" + str(checkpoint)
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))

    ray.shutdown()
    import pybullet_envs  # noqa: F401 -- registers the Bullet envs with gym
    ray.init()
    # RBFModel, MLP, and Linear are this project's custom models.
    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    # PPO restores from the pickled config; the other trainers use the JSON one.
    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    elif alg == "SAC":
        trainer = sac.SACTrainer(config)
    elif alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    elif alg == "PG":
        trainer = pg.PGTrainer(config)
    elif alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    elif alg == "TD3":
        trainer = td3.TD3Trainer(config)
    elif alg == "ES":
        trainer = es.ESTrainer(config)
    elif alg == "ARS":
        trainer = ars.ARSTrainer(config)
    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    obs_hist = []
    reward_hist = []
    for t in range(10000):
        # For some algorithms the action distribution parameters can also be
        # fetched, e.g.:
        #   sampled_actions, _, out_dict = trainer.compute_action(obs.flatten(), full_fetch=True)
        #   mean_actions = out_dict['behaviour_logits'][:17]  # slice depends on the env
        # Reminder: the behaviour logits are the mean and logstd (not the log
        # mean, despite the name "logit").
        actions = trainer.compute_action(obs.flatten())
        obs, reward, done, _ = env.step(np.asarray(actions))
        env.render(mode='human')
        time.sleep(0.01)
        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()
    # Return the histories so they can be inspected or plotted after the rollout.
    return action_hist, obs_hist, reward_hist
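# The commented-out plotting in the original render() hints at inspecting the
# rollout histories. A hedged sketch of that post-processing (matplotlib is
# assumed to be installed; plot_histories is a hypothetical helper fed by the
# values render() now returns):
import matplotlib.pyplot as plt
import numpy as np


def plot_histories(action_hist, obs_hist, reward_hist):
    """Plot actions, observations, and cumulative reward over the rollout."""
    fig, axes = plt.subplots(3, 1, sharex=True)
    axes[0].plot(np.asarray(action_hist))
    axes[0].set_ylabel("action")
    axes[1].plot(np.asarray(obs_hist))
    axes[1].set_ylabel("observation")
    axes[2].plot(np.cumsum(np.asarray(reward_hist)))
    axes[2].set_ylabel("cumulative reward")
    axes[2].set_xlabel("timestep")
    plt.show()


# Example usage (checkpoint number and experiment path are placeholders):
# plot_histories(*render(180, "evaluate/ARS_jumper_.../"))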