Example #1
def try_ray_ppo_planning(cls, trial_count):
    with mlflow.start_run():
        ray.init()
        config = {
            "env": cls,
            "num_workers": 3,
            "num_gpus": 0,
            "horizon": 5,
            "train_batch_size":
            12000,  # val of 128 leads to ~1s per training iteration.
        }
        full_config = DEFAULT_CONFIG.copy()
        for k, v in config.items():
            full_config[k] = v
        pprint(full_config)
        agent = PPOTrainerWithReset(full_config)
        strategy_fn = get_strategy_function(cls, agent)
        strategy_fn.info = "Ray PPO Planning strategy"

        trial_result = do_trials(cls,
                                 trial_count,
                                 strategy_fn,
                                 max_steps_per_episode=10000,
                                 always_print=True)
        checkpoint = agent.save()
        print(f"checkpoint saved at {checkpoint}")
        mlflow.log_metrics(trial_result)
Example #2
def tune_run(n_agents=3, episode_length=4000, config=None):
    ray.init()
    tf.compat.v1.enable_v2_behavior()
    # initialize trainer
    env = ASMEnv(n_agents=n_agents)
    register_env(
        "asm",
        lambda _: ASMEnv(n_agents=n_agents, episode_length=episode_length))
    policies = {
        "govt_policy":
        (PPOTFPolicy, env.observation_space, env.govt_action_space, {}),
    }
    for idx in range(n_agents):
        policies[f"citizen_policy_{idx}"] = (PPOTFPolicy,
                                             env.observation_space,
                                             env.citizen_action_space, {})
    if config is None:
        ppo_config = DEFAULT_CONFIG.copy()
    else:
        ppo_config = config
    ppo_config["env"] = "asm"
    ppo_config["train_batch_size"] = 400
    ppo_config["timesteps_per_iteration"] = episode_length
    ppo_config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": list(policies.keys()),
    }
    tune.run("PPO", stop={"training_iteration": 100}, config=ppo_config)
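
# The policy_mapping_fn referenced in the "multiagent" block above is defined
# elsewhere in the source. A minimal sketch, assuming agent IDs of the form
# "govt" and "citizen_<idx>" (hypothetical names, not confirmed by the original):
def policy_mapping_fn(agent_id):
    if agent_id.startswith("citizen"):
        idx = agent_id.split("_")[-1]
        return f"citizen_policy_{idx}"
    return "govt_policy"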
Example #3
    def _build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()
        trainer_config['num_workers'] = 0
        trainer_config["train_batch_size"] = 640
        trainer_config["sgd_minibatch_size"] = 64
        trainer_config["num_sgd_iter"] = 10
        trainer = PPOTrainer(trainer_config, self.env_class)

        return trainer
Example #4
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./ppo_checkpoint_201/checkpoint-201")

    return trainer  #.get_policy("trainer")
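
# A hedged usage sketch (not part of the original): the restored PPOTrainer can
# be queried directly for actions on observations from the Tron environment.
#   trainer = load_agent()
#   action = trainer.compute_action(observation)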
Example #5
def defaultConfig(args):

    if args.agent == "PPO":
        config = PPO_CONFIG.copy()
    else:
        raise ValueError(f"unsupported agent: {args.agent}")

    config['num_workers'] = args.workers
    config['num_gpus'] = 1
    config['framework'] = "torch"
    config['gamma'] = args.gamma

    # config['model']['dim'] = 21
    # config['model']['conv_filters'] = [ [8, [3, 3], 2],
    #                                     [16, [2, 2], 2],
    #                                     [512, [6, 6], 1]]

    return config
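
# A minimal usage sketch (not part of the original): defaultConfig() only reads
# the attributes agent, workers and gamma from args, so a simple namespace is
# enough to build a config and hand it to RLlib's PPOTrainer. The environment
# name below is an arbitrary placeholder.
from argparse import Namespace
from ray.rllib.agents.ppo import PPOTrainer

args = Namespace(agent="PPO", workers=2, gamma=0.99)
config = defaultConfig(args)
trainer = PPOTrainer(config, env="CartPole-v0")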
Example #6
    def build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()

        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #         "type": "Curiosity",
        #         "eta": 0.2,
        #         "lr": 0.001,
        #         "feature_dim": 128,
        #         "feature_net_config": {
        #             "fcnet_hiddens": [],
        #             "fcnet_activation": "relu",
        #         },
        #         "sub_exploration": {
        #             "type": "StochasticSampling",
        #         }
        #     }

        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """

        # trainer_config["env_config"] = copy.deepcopy(env_config)  #  {"rules": "qiyang_role"}

        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer
Example #7
def main():
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 1
    config['num_gpus'] = 0
    config['num_cpus_per_worker'] = 0
    config["entropy_coeff"] = 0

    register_env("HiveEnv", lambda c: GymHive(c))
    agent = PPOTrainer(config, env='HiveEnv')

    results = []
    episode_data = []
    episode_json = []

    n_epochs = 1000
    for epoch in range(n_epochs):
        # for _ in range(1000):
        #     agent.render()
        #     agent.step(agent.compute_action())

        result = agent.train()
        results.append(result)

        episode = {
            'epoch': epoch,
            'episode_reward_min': result['episode_reward_min'],
            'episode_reward_mean': result['episode_reward_mean'],
            'episode_reward_max': result['episode_reward_max'],
            'episode_len_mean': result['episode_len_mean']
        }

        episode_data.append(episode)
        episode_json.append(json.dumps(episode))

        print(
            f'{epoch:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f}'
        )
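
    # A hedged addition (not part of the original): after the loop, persist the
    # collected per-epoch metrics, one JSON object per line (the filename is a
    # placeholder).
    with open('hive_ppo_episodes.json', 'w') as f:
        f.write('\n'.join(episode_json))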
Example #8
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

ray.init(num_gpus=1)

config = DEFAULT_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
config[
    'num_cpus_per_worker'] = 0  # This avoids running out of resources in the notebook environment when this cell is re-executed

agent = PPOTrainer(config, 'CartPole-v0')

for i in range(5):
    result = agent.train()
    print(pretty_print(result))
Example #9
import gym
import ray.utils
from ray.tune.logger import pretty_print
from envs import env_config
from envs.ssa_tasker_simple_2 import SSA_Tasker_Env
import datetime
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG as PPO_CONFIG
import numpy as np
import pickle

ray.init()

env_config['rso_count'] = 20

config = PPO_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 4
# !---- found that the network design from Jones's work had little effect in training
# config['model']['fcnet_hiddens'] = [180, 95, 50] # 10 RSOs
# config['model']['fcnet_hiddens'] = [360, 180, 100] # 20 RSOs
# config['model']['fcnet_hiddens'] = [720, 380, 200] # 40 RSOs
config['gamma'] = 0.99  # gamma  (float) Discount factor
config['rollout_fragment_length'] = 32
if env_config['rso_count'] == 40:
    #config['model']['fcnet_hiddens'] = [512, 512] # 40 RSOs
    config[
        'rollout_fragment_length'] = 128  # n_steps (int) The number of steps to run for each environment per update
# (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
config[
    'entropy_coeff'] = 0.01  # ent_coef (float) Entropy coefficient for the loss calculation
config[
Example #10
        self.x = x_next
        reward = -1 * self.cost(x_next, action_vector)
        done = False
        if self.x[1] < 1:
            done = True
        return self.x, reward, done, {}

    def reset(self):
        self.f, self.x, self.cost = make_SIR_Treatement_model(self.S_0, self.I_0, alpha=self.alpha, beta=self.beta,
                                                              f=0.5, B=1)
        return self.x




trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10




trainer = PPOTrainer(trainer_config, SIR)
for i in range(200):
    print("Training iteration {}...".format(i))
    trainer.train()
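
# A hedged addition (not part of the original): save a checkpoint once the
# training loop finishes so the learned treatment policy can be reloaded later.
checkpoint_path = trainer.save()
print("checkpoint saved at", checkpoint_path)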


Example #11
rllib_cfg[
    "log_level"] = "INFO"  # Set the ray.rllib.* log level for the agent process and its workers [DEBUG, INFO, WARN, or ERROR]
rllib_cfg[
    "log_sys_usage"] = True  # Monitor system resource metrics (requires `psutil` and `gputil`)
rllib_cfg[
    "metrics_smoothing_episodes"] = 100  # Smooth metrics over this many episodes
rllib_cfg[
    "collect_metrics_timeout"] = 180  # Wait for metric batches for this duration. If not in time, collect in the next train iteration.
rllib_cfg[
    "timesteps_per_iteration"] = 0  # Minimum env steps to optimize for per train call. It does not affect learning, only monitoring.

# ================== Configure learning algorithm ==================

# Copy the default learning algorithm configuration, including PPO-specific parameters,
# then overwrite only the common parameters that have been updated.
agent_cfg = AGENT_DEFAULT_CONFIG.copy()
for key, value in rllib_cfg.items():
    if COMMON_CONFIG[key] != value:
        agent_cfg[key] = value

# Estimators settings
agent_cfg[
    "use_gae"] = True  # Use the Generalized Advantage Estimator (GAE) with a value function (https://arxiv.org/pdf/1506.02438.pdf)
agent_cfg[
    "use_critic"] = True  # Use a critic as a value baseline (otherwise don't use any; required for using GAE).
agent_cfg["lambda"] = 0.95  # The GAE(lambda) parameter.

# Learning settings
agent_cfg[
    "kl_coeff"] = 0.0  # Initial coefficient for KL divergence. (0.0 for L^CLIP)
agent_cfg["kl_target"] = 0.01  # Target value for KL divergence
Example #12
 def testLocal(self):
     ray.init(local_mode=True)
     cf = DEFAULT_CONFIG.copy()
     agent = PPOAgent(cf, "CartPole-v0")
     print(agent.train())
Example #13
 def test_local(self):
     cf = DEFAULT_CONFIG.copy()
     for _ in framework_iterator(cf):
         agent = PPOTrainer(cf, "CartPole-v0")
         print(agent.train())
         agent.stop()
Example #14
 def test_local(self):
     cf = DEFAULT_CONFIG.copy()
     agent = PPOTrainer(cf, "CartPole-v0")
     print(agent.train())
Example #15
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer":
            (None, env.observation_space, env.action_space, agent_config),
            "opponent":
            (None, env.observation_space, env.action_space, agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")

    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")

    return trainer.get_policy("trainer")
Example #16
def create_env_config(cli_args):
    """
    Create environment and RLlib config based on passed CLI args. Return config.

    :param cli_args: Parsed CLI args
    :return: The complete config for an RLlib agent, including the env & env_config
    """
    env_class = get_env_class(cli_args.agent)
    map, ue_list, bs_list = get_env(cli_args.env, cli_args.bs_dist, cli_args.static_ues, cli_args.slow_ues,
                                    cli_args.fast_ues, cli_args.sharing, cli_args.num_bs)

    # this is for DrEnv and step utility
    # env_config = {
    #     'episode_length': eps_length, 'seed': seed,
    #     'map': map, 'bs_list': bs_list, 'ue_list': ue_list, 'dr_cutoff': 'auto', 'sub_req_dr': True,
    #     'curr_dr_obs': False, 'ues_at_bs_obs': False, 'dist_obs': False, 'next_dist_obs': False
    # }
    # this is for the custom NormEnv and log utility
    env_config = {
        'episode_length': cli_args.eps_length, 'seed': cli_args.seed, 'map': map, 'bs_list': bs_list, 'ue_list': ue_list,
        'rand_episodes': cli_args.rand_train, 'new_ue_interval': cli_args.new_ue_interval, 'reward': cli_args.reward,
        'max_ues': cli_args.max_ues, 'ue_arrival': get_ue_arrival(cli_args.ue_arrival),
        # if enabled log_metrics: log metrics even during training --> visible on tensorboard
        # if disabled: log just during testing --> probably slightly faster training with less memory
        'log_metrics': True,
        # custom animation rendering
        'dashboard': cli_args.dashboard, 'ue_details': cli_args.ue_details,
    }
    # convert ue_arrival sequence to str keys as required by RLlib: https://github.com/ray-project/ray/issues/16215
    if env_config['ue_arrival'] is not None:
        env_config['ue_arrival'] = {str(k): v for k, v in env_config['ue_arrival'].items()}

    # create and return the config
    config = DEFAULT_CONFIG.copy()
    # discount factor (default 0.99)
    # config['gamma'] = 0.5
    # 0 = no workers/actors at all --> low overhead for short debugging; 2+ workers to accelerate long training
    config['num_workers'] = cli_args.workers
    config['seed'] = cli_args.seed
    # write training stats to file under ~/ray_results (default: False)
    config['monitor'] = True
    config['train_batch_size'] = cli_args.batch_size        # default: 4000; default in stable_baselines: 128
    # auto-normalize observations by subtracting mean and dividing by std (default: "NoFilter")
    # config['observation_filter'] = "MeanStdFilter"
    # NN settings: https://docs.ray.io/en/latest/rllib-models.html#built-in-model-parameters
    # configure the size of the neural network's hidden layers; default: [256, 256]
    # config['model']['fcnet_hiddens'] = [512, 512, 512]
    # LSTM settings
    config['model']['use_lstm'] = cli_args.lstm
    # config['model']['lstm_use_prev_action_reward'] = True
    # config['log_level'] = 'INFO'    # ray logging default: warning
    # reset the env whenever the horizon/eps_length is reached
    config['horizon'] = cli_args.eps_length
    config['env'] = env_class
    config['env_config'] = env_config
    # callback for monitoring custom metrics
    config['callbacks'] = CustomMetricCallbacks
    config['log_level'] = 'ERROR'

    # for multi-agent env: https://docs.ray.io/en/latest/rllib-env.html#multi-agent-and-hierarchical
    if MultiAgentEnv in env_class.__mro__:
        # instantiate env to access obs and action space and num diff UEs
        env = env_class(env_config)

        # use separate policies (and NNs) for each agent
        if cli_args.separate_agent_nns:
            num_diff_ues = env.get_num_diff_ues()
            # create policies also for all future UEs
            if num_diff_ues > env.num_ue:
                log.warning("Varying num. UEs. Creating policy for all (future) UEs.",
                            curr_num_ue=env.num_ue, num_diff_ues=num_diff_ues, new_ue_interval=env.new_ue_interval,
                            ue_arrival=env.ue_arrival)
                ue_ids = [str(i + 1) for i in range(num_diff_ues)]
            else:
                ue_ids = [ue.id for ue in ue_list]

            config['multiagent'] = {
                # attention: ue.id needs to be a string! just casting it to str() here doesn't work;
                # needs to be consistent with obs keys --> easier, just use string IDs
                'policies': {ue_id: (None, env.observation_space, env.action_space, {}) for ue_id in ue_ids},
                'policy_mapping_fn': lambda agent_id: agent_id
            }
        # or: all UEs use the same policy and NN
        else:
            config['multiagent'] = {
                'policies': {'ue': (None, env.observation_space, env.action_space, {})},
                'policy_mapping_fn': lambda agent_id: 'ue'
            }

    return config
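
# A minimal usage sketch (not part of the original): the returned dict already
# contains 'env' and 'env_config', so it can be handed to RLlib directly.
# cli_args is whatever this repo's CLI parser produces.
#   config = create_env_config(cli_args)
#   trainer = PPOTrainer(config=config)
#   result = trainer.train()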