Example #1
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary0']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)
    def my_train_fn(config, reporter):
        assert args.num_learners >= 4, 'Requires 4 or more trainable agents'
        ppo_trainer = PPOTrainer(env='c4', config=config)
        while True:
            result = ppo_trainer.train()
            if 'evaluation' in result:
                train_policies = config['multiagent']['policies_to_train']
                scores = {k: v for k, v in result['evaluation']['policy_reward_mean'].items() if k in train_policies}

                scores_dist = softmax(np.array(list(scores.values())) / tau)
                new_trainables = random.choices(list(scores.keys()), scores_dist, k=len(scores))
                # new_trainables = train_policies
                # random.shuffle(new_trainables)

                # Reassign weights: each existing policy id takes over the weights
                # of the policy sampled for it above.
                weights = ppo_trainer.get_weights()
                new_weights = {old_pid: weights[new_pid] for old_pid, new_pid in zip(weights.keys(), new_trainables)}
                # new_weights = {pid: np.zeros_like(wt) for pid, wt in weights.items() if wt is not None}
                # new_weights = {pid: np.ones_like(wt)*-100 for pid, wt in weights.items() if wt is not None}
                # new_weights = {pid: np.random.rand(*wt.shape) for pid, wt in weights.items() if wt is not None}

                print('\n\n################\nSETTING WEIGHTS\n################\n\n')
                ppo_trainer.set_weights(new_weights)

                num_metrics = 4
                c = Counter(new_trainables)
                result['custom_metrics'].update(
                    {f'most_common{i:02d}': v[1] for i, v in enumerate(c.most_common(num_metrics))})
                result['custom_metrics'].update(
                    {f'scores_dist{i:02d}': v for i, v in enumerate(sorted(scores_dist, reverse=True)[:num_metrics])})
                print('scores_dist', scores_dist)
                # result['custom_metrics'].update(
                #     {f'new_agent{i:02d}': int(v[-2:]) for i, v in enumerate(new_trainables)})
            reporter(**result)
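
The train function above targets Ray Tune's legacy function-trainable API, in which the function receives (config, reporter) and is launched through tune.run. Below is a minimal, self-contained sketch of that wiring, assuming an older Ray/RLlib release where ray.rllib.agents.ppo.PPOTrainer is available; the environment id and config values are placeholders, not values from the example above.

import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer


def simple_train_fn(config, reporter):
    # Build a PPOTrainer by hand and push each training result back to Tune.
    trainer = PPOTrainer(env=config["env"], config=config["trainer_config"])
    for _ in range(config["iterations"]):
        reporter(**trainer.train())


if __name__ == "__main__":
    ray.init()
    tune.run(
        simple_train_fn,
        config={
            "env": "CartPole-v0",                  # placeholder environment id
            "trainer_config": {"num_workers": 1},  # placeholder trainer config
            "iterations": 5,
        },
    )
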
Example #3
def get_ppo_train(name, pols, env, logdir, gamma, shape, lr, batch_size):
    config = {
        "gamma": gamma,
        "sample_batch_size": batch_size,
        "lr": lr,
        "multiagent": {
            "policies": pols,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": [name],
        },
        "model": {
            "fcnet_activation": "tanh",
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": shape,
        },
        # disable filters, otherwise we would need to synchronize those
        # as well to the DQN agent
        "observation_filter": "NoFilter",
        "callbacks": {
            "on_train_result": on_episode_end
        }
    }
    return PPOTrainer(env=env,
                      config=config,
                      logger_creator=lambda _: UnifiedLogger(config, logdir))
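
The helper above expects an RLlib policies dict (pols) and a policy_mapping_fn defined elsewhere. The following is a hedged sketch of what those arguments typically look like; the spaces and policy names are placeholders, not values from the source project.

from gym.spaces import Box, Discrete

# Placeholder observation/action spaces, for illustration only.
obs_space = Box(low=-1.0, high=1.0, shape=(4,))
act_space = Discrete(2)

# RLlib policy spec: (policy_cls or None for the default, obs_space, act_space, per-policy config).
pols = {
    "learner": (None, obs_space, act_space, {}),
    "opponent": (None, obs_space, act_space, {}),
}

def policy_mapping_fn(agent_id):
    # Send even-hashed agent ids to the trainable policy, the rest to the opponent.
    return "learner" if hash(agent_id) % 2 == 0 else "opponent"
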
Example #4
def jim_load(path, env_params, jim=True):
    if not jim:
        # Read in params used to create trainer
        config_path = os.path.join(os.path.dirname(path), "config.pkl")
        with open(config_path, "rb") as f:
            # We use dill (instead of pickle) here because we must deserialize functions
            config = dill.load(f)
        config['rllib_params']['env_config'] = env_params
    else:
        model_params = {
            "NUM_HIDDEN_LAYERS": 0,
            "SIZE_HIDDEN_LAYERS": 256,
            "NUM_FILTERS": 64,
            "NUM_CONV_LAYERS": 3
        }
        config = {
            "model": {
                "custom_model_config": model_params,
                "custom_model": RllibDQNModel
            },
            "gamma": 0.995,
            "framework": "torch",
            "env_config": env_params,
            "hiddens": [256, 256],
            "output": 'brawl-training/results',
            "lr": 1e-4,
            "v_min": -300.0,
            "v_max": 300.0,
            "noisy": True,
            "sigma0": 0.2,
            "n_step": 5,
            "exploration_config": {
                "type": "EpsilonGreedy",
                "initial_epsilon": 1.0,
                "final_epsilon": 0.01,
                "epsilon_timesteps": 200000
            }
        }
    ray.shutdown()
    ray.init()

    def env_creator(env_config):
        return SSBMEnv(**env_config)

    register_env("SSBM", env_creator)

    if jim:
        trainer = dqn.DQNTrainer(env="SSBM", config=config)
    else:
        trainer = PPOTrainer(env="melee", config=config['rllib_params'])

    trainer.restore(path)
    return trainer
Example #5
def main(checkpoint):
    env = CustomKukaEnv(dict(renders=True, isDiscrete=False,
                             maxSteps=10000000))

    class EnvPlaceholder(gym.Env):
        def __init__(self, env_config):
            super(EnvPlaceholder, self).__init__()
            self.observation_space = env.observation_space
            self.action_space = env.action_space

    trainer = PPOTrainer(config=config, env=EnvPlaceholder)

    trainer.restore(checkpoint)
    done = False
    i = 0
    while not done:
        time.sleep(0.01)
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        obs = env.getExtendedObservation()
        print(i)
        print(f"Action: {action}")
        print(f"Observation: {obs}")
        i += 1
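
Note that the loop above never queries the restored policy: actions come from env.action_space.sample(), and config is assumed to be defined elsewhere in the module. A hedged variant that acts with the restored trainer instead, assuming the observations returned by CustomKukaEnv match the policy's observation space:

    obs = env.reset()
    done = False
    while not done:
        time.sleep(0.01)
        action = trainer.compute_action(obs)  # act with the restored PPO policy
        obs, reward, done, info = env.step(action)
        print(f"Action: {action}, Reward: {reward}")
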
Example #6
    def __init__(self,
                 ticker_list,
                 time_interval,
                 drl_lib,
                 agent,
                 cwd,
                 net_dim,
                 state_dim,
                 action_dim,
                 API_KEY,
                 API_SECRET,
                 APCA_API_BASE_URL,
                 tech_indicator_list,
                 turbulence_thresh=30,
                 max_stock=1e2,
                 latency=None):
        #load agent
        self.drl_lib = drl_lib
        if agent == 'ppo':
            if drl_lib == 'elegantrl':
                from elegantrl.agent import AgentPPO
                from elegantrl.run import Arguments, init_agent
                #load agent
                config = {
                    'state_dim': state_dim,
                    'action_dim': action_dim,
                }
                args = Arguments(agent=AgentPPO, env=StockEnvEmpty(config))
                args.cwd = cwd
                args.net_dim = net_dim
                # load agent
                try:
                    agent = init_agent(args, gpu_id=0)
                    self.act = agent.act
                    self.device = agent.device
                except BaseException:
                    raise ValueError("Fail to load agent!")

            elif drl_lib == 'rllib':
                from ray.rllib.agents import ppo
                from ray.rllib.agents.ppo.ppo import PPOTrainer

                config = ppo.DEFAULT_CONFIG.copy()
                config['env'] = StockEnvEmpty
                config["log_level"] = "WARN"
                config['env_config'] = {
                    'state_dim': state_dim,
                    'action_dim': action_dim,
                }
                trainer = PPOTrainer(env=StockEnvEmpty, config=config)
                try:
                    trainer.restore(cwd)
                    self.agent = trainer
                    print("Restoring from checkpoint path", cwd)
                except:
                    raise ValueError('Fail to load agent!')

            elif drl_lib == 'stable_baselines3':
                from stable_baselines3 import PPO

                try:
                    #load agent
                    self.model = PPO.load(cwd)
                    print("Successfully load model", cwd)
                except:
                    raise ValueError('Fail to load agent!')

            else:
                raise ValueError(
                    'The DRL library input is NOT supported yet. Please check your input.'
                )

        else:
            raise ValueError('Agent input is NOT supported yet.')

        #connect to Alpaca trading API
        try:
            self.alpaca = tradeapi.REST(API_KEY, API_SECRET, APCA_API_BASE_URL,
                                        'v2')
        except:
            raise ValueError(
                'Fail to connect Alpaca. Please check account info and internet connection.'
            )

        #read trading time interval
        if time_interval == '1s':
            self.time_interval = 1
        elif time_interval == '5s':
            self.time_interval = 5
        elif time_interval == '1Min':
            self.time_interval = 60
        elif time_interval == '5Min':
            self.time_interval = 60 * 5
        elif time_interval == '15Min':
            self.time_interval = 60 * 15
        else:
            raise ValueError('Time interval input is NOT supported yet.')

        #read trading settings
        self.tech_indicator_list = tech_indicator_list
        self.turbulence_thresh = turbulence_thresh
        self.max_stock = max_stock

        #initialize account
        self.stocks = np.asarray([0] * len(ticker_list))  #stocks holding
        self.stocks_cd = np.zeros_like(self.stocks)
        self.cash = None  #cash record
        self.stocks_df = pd.DataFrame(self.stocks,
                                      columns=['stocks'],
                                      index=ticker_list)
        self.asset_list = []
        self.price = np.asarray([0] * len(ticker_list))
        self.stockUniverse = ticker_list
        self.turbulence_bool = 0
        self.equities = []
        "num_workers": 0
    }

    dqn_config = {
        "timesteps_per_iteration": 1000,
        "model": {
            "custom_model": "ParametricActionsModel"
        },
        "num_workers": 0,
        "hiddens": [],
        "dueling": False,
        "v_min": -5,
        "v_max": 135,
        #"noisy": True
    }

    #trainer = DQNTrainer(env="ExternalMkt", config=dqn_config)
    trainer = PPOTrainer(env="ExternalMkt", config=ppo_config)

    for i in range(1, 21):
        result = trainer.train()
        print(
            "Iteration {}, Episodes {}, Mean Reward {}, Mean Length {}".format(
                i, result['episodes_this_iter'], result['episode_reward_mean'],
                result['episode_len_mean']))

    ray.shutdown()
Example #8
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, A2C

import gym
import traf_env
import multi_traf_env
import ray
from ray import tune
from ray.rllib.policy import Policy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.registry import register_env
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy

ray.init()
register_env("multi_air-v0",
             lambda c: multi_traf_env.AirTrafficGym(num_agents=2))
trainer = PPOTrainer(env="multi_air-v0")
num_train_itr = 50
for i in range(num_train_itr):
    print("****************************Iteration: ", i,
          "****************************")
    print(trainer.train())
Example #9
  ppo_trainer = PPOTrainer(
      env="gfootball",
      config={
          "framework": "torch",
          # Should use a critic as a baseline (otherwise don't use value baseline;
          # required for using GAE).
          "use_critic": True,
          # If true, use the Generalized Advantage Estimator (GAE)
          # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
          "use_gae": True,
          "observation_filter": "NoFilter",
          # "_fake_gpus": True,
          "num_workers": 8,
          "num_envs_per_worker": 4,
          "num_gpus": 1,  # driver GPU
          # "num_gpus_per_worker": 0.05,  # GPU fraction per worker
          "ignore_worker_failures": True,
          "train_batch_size": 6400,
          # "sample_batch_size": 100,
          "rollout_fragment_length": 200,  # other values tried: 300, 350, 512
          "sgd_minibatch_size": 500,
          "lr": 2.5e-4,  # other values tried: 2.5e-3, 3e-4
          "lambda": .95,
          "gamma": .998,
          "entropy_coeff": 1e-4,  # try 0.01
          "kl_coeff": 1.0,
          "clip_param": 0.2,
          "num_sgd_iter": 10,
          "vf_share_layers": True,  # share layers
          "vf_clip_param": 10.0,
          "vf_loss_coeff": 10,  # tune this to scale the loss; 1.0 and 0.5 also tried
          "model": {
              "custom_model": "my_model"
          },
          "multiagent": {
              "policies": {
                  "policy_01": (None, obs_space, act_space, {}),
                  "policy_02": (None, obs_space, act_space, {}),
                  "policy_03": (None, obs_space, act_space, {}),
                  "policy_04": (None, obs_space, act_space, {})
              },
              "policy_mapping_fn": tune.function(policy_mapping_fn),
              "policies_to_train": ["policy_01"]
          },
          # Uncomment to turn on evaluation intervals while training:
          # "evaluation_interval": 50,
          # "evaluation_config": {
          #     "env_config": {
          #         # Use test set to evaluate
          #         'mode': 'test'
          #     },
          #     "explore": False,
          #     "multiagent": {
          #         "policies": {
          #             # the first tuple value is None -> uses default policy class
          #             "policy_01": (None, obs_space, act_space, {}),
          #             "policy_02": (None, obs_space, act_space, {}),
          #             "policy_03": (None, obs_space, act_space, {}),
          #             "policy_04": (None, obs_space, act_space, {}),
          #         },
          #         "policy_mapping_fn": policy_mapping_fn_eval
          #     },
          # },
      })
Example #10
def gen_trainer_from_params(params):
    # All ray environment set-up
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True, include_webui=False, temp_dir=params['ray_params']['temp_dir'])
    register_env("overcooked_multi_agent", params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(params['ray_params']['custom_model_id'], params['ray_params']['custom_model_cls'])

    # Parse params
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    multi_agent_params = params['environment_params']['multi_agent_params']
    agent_params = params["agent_params"] # only ml based agents

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into ppotrainer config
    def gen_policy(policy_type="ppo"):
        return (
            agent_params[policy_type].get("policy_cls"),
            env.observation_spaces[policy_type],
            env.action_space,
            agent_params[policy_type]["config"]
            )

    # Rllib compatible way of setting the directory we store agent checkpoints in
    logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"], params['training_params']['seed'], timestr)
    def custom_logger_creator(config):
        """Creates a Unified logger that stores results in <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
        """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception as e:
                print("error creating custom logging dir. Falling back to default logdir {}".format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(
            prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = environment_params["mdp_params"]
    
    # Create rllib compatible multi-agent config based on params
    multi_agent_config = {}

    if multi_agent_params.get('bc_schedule'):
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(multi_agent_params['bc_schedule'])
    else:
        agents_schedule = multi_agent_params['agents_schedule']
    all_policies = OvercookedMultiAgent.agents_from_schedule(agents_schedule)
    ml_policies = [p for p in all_policies if OvercookedMultiAgent.is_ml_agent(p)]

    multi_agent_config['policies'] = { policy : gen_policy(policy) for policy in ml_policies }
    
    def select_policy(agent_id):
        return OvercookedMultiAgent.agent_id_to_agent_name(agent_id)

    multi_agent_config['policy_mapping_fn'] = select_policy
    multi_agent_config['policies_to_train'] = 'ppo'

    eval_function = get_rllib_eval_function(evaluation_params, environment_params['eval_mdp_params'],
        environment_params['env_params'], environment_params["outer_shape"], multi_agent_params["featurize_fns"], shuffle=multi_agent_params["shuffle_agents"],
        )

    trainer = PPOTrainer(env="overcooked_multi_agent", config={
        "multiagent": multi_agent_config,
        "callbacks" : TrainingCallbacks,
        "custom_eval_function" : eval_function,
        "env_config" : environment_params,
        "eager" : False,
        **training_params
    }, logger_creator=custom_logger_creator)
    return trainer
Example #11
 ray.init()
 num_policies = 4
 policies = {
     "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
     for i in range(num_policies)
 }
 policy_ids = list(policies.keys())
 config = {
     "multiagent": {
         "policies": policies,
         "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
     },
     "framework": "tf",
 }
 #trainer = ApexTrainer(env=TicTacToe, config=config)
 trainer = PPOTrainer(env=TicTacToe, config=config)
 trainer.restore("ttt_model/checkpoint_51/checkpoint-51")
 obs = env.reset()
 print(obs)
 done = False
 while not done:
     env.render()
     player = list(obs)[0]
     if player == "X":
         action = int(input(f"Player {player} - enter action 1-9:")) - 1
     else:
         action = trainer.compute_action(np.array(obs["O"]), policy_id="policy_1")
     obs, rewards, dones, infos = env.step({player: action})
     done = dones["__all__"]
     print(obs, rewards, dones, infos)
 env.render()
config["lr"] = 0.001
config["num_sgd_iter"] = 5
config["sgd_minibatch_size"] = 8192
config["train_batch_size"] = 20000
config["use_gae"] = True
config["vf_clip_param"] = 10
config["vf_loss_coeff"] = 1
config["vf_share_layers"] = False

# For better gradient estimates in the later stages
# of the training, increase the batch sizes.
# config["sgd_minibatch_size"] = 8192 * 4
# config["train_batch_size"] = 20000 * 10

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)

# Use this when you want to continue from a checkpoint.
# trainer.restore(
#   "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737"
# )

best_mean_reward = np.NINF
while True:
    result = trainer.train()
    print(pretty_print(result))
    mean_reward = result.get("episode_reward_mean", np.NINF)
    if mean_reward > best_mean_reward:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
        best_mean_reward = mean_reward
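
To evaluate the best checkpoint saved by the loop above, the trainer can be restored and rolled out with the learned policy. The following is a hedged sketch, assuming InventoryEnv can be constructed without arguments and that its observations match the policy's observation space:

trainer.restore(checkpoint)      # the checkpoint path printed by trainer.save() above
eval_env = InventoryEnv()        # assumption: InventoryEnv needs no constructor arguments
obs = eval_env.reset()
done = False
total_reward = 0.0
while not done:
    action = trainer.compute_action(obs)
    obs, reward, done, info = eval_env.step(action)
    total_reward += reward
print("Evaluation episode reward:", total_reward)
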
def gen_trainer_from_params(params):
    # All ray environment set-up
    if not ray.is_initialized():
        init_params = {
            "ignore_reinit_error": True,
            "include_webui": False,
            "temp_dir": params['ray_params']['temp_dir'],
            "log_to_driver": params['verbose'],
            "logging_level": logging.INFO if params['verbose'] else logging.CRITICAL,
        }
        ray.init(**init_params)
    register_env("overcooked_multi_agent", params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(
        params['ray_params']['custom_model_id'],
        params['ray_params']['custom_model_cls'])

    # Parse params
    model_params = params['model_params']
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    bc_params = params['bc_params']
    multi_agent_params = params['environment_params']['multi_agent_params']

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into ppotrainer config
    def gen_policy(policy_type="ppo"):
        # supported policy types thus far
        assert policy_type in ["ppo", "bc"]

        if policy_type == "ppo":
            config = {
                "model": {
                    "custom_options": model_params,
                    "custom_model": "MyPPOModel"
                }
            }
            return (None, env.ppo_observation_space, env.action_space, config)
        elif policy_type == "bc":
            bc_cls = bc_params['bc_policy_cls']
            bc_config = bc_params['bc_config']
            return (bc_cls, env.bc_observation_space, env.action_space,
                    bc_config)

    # Rllib compatible way of setting the directory we store agent checkpoints in
    logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"],
                                         params['training_params']['seed'],
                                         timestr)

    def custom_logger_creator(config):
        """Creates a Unified logger that stores results in <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
                """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception as e:
                print(
                    "error creating custom logging dir. Falling back to default logdir {}"
                    .format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    # Create rllib compatible multi-agent config based on params
    multi_agent_config = {}
    all_policies = ['ppo']

    # Whether both agents should be learned
    self_play = iterable_equal(multi_agent_params['bc_schedule'],
                               OvercookedMultiAgent.self_play_bc_schedule)
    if not self_play:
        all_policies.append('bc')

    multi_agent_config['policies'] = {
        policy: gen_policy(policy)
        for policy in all_policies
    }

    def select_policy(agent_id):
        if agent_id.startswith('ppo'):
            return 'ppo'
        if agent_id.startswith('bc'):
            return 'bc'

    multi_agent_config['policy_mapping_fn'] = select_policy
    multi_agent_config['policies_to_train'] = 'ppo'

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = environment_params[
            "mdp_params"]
    trainer = PPOTrainer(
        env="overcooked_multi_agent",
        config={
            "multiagent": multi_agent_config,
            "callbacks": TrainingCallbacks,
            "custom_eval_function": get_rllib_eval_function(
                evaluation_params,
                environment_params['eval_mdp_params'],
                environment_params['env_params'],
                environment_params["outer_shape"],
                'ppo',
                'ppo' if self_play else 'bc',
                verbose=params['verbose']),
            "env_config": environment_params,
            "eager": False,
            **training_params
        },
        logger_creator=custom_logger_creator)
    return trainer
Example #14
    config = {
        "num_gpus": 1,
        "env": "yaniv",
        "env_config": env_config,
        "framework": "torch",
        "multiagent": {
            "policies": {
                "policy_1": (None, obs_space, act_space, {}),
                "policy_2": (None, obs_space, act_space, {}),
                "policy_3": (None, obs_space, act_space, {}),
                "policy_4": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["policy_1"],

        },
        "model": {
            "custom_model": "yaniv_mask",
            "fcnet_hiddens": [512, 512],
        },
    }

    ray.init(include_dashboard=False)

    trainer = PPOTrainer(env="yaniv", config=config)
    trainer.restore(args.checkpoint)

    tourny = YanivTournament(env_config, trainers=[trainer])
    tourny.run(args.eval_num)
    print("\n\nRESULTS:\n")
    tourny.print_stats()
Example #15
def main():
    """main function"""

    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # used with the parallel-simulation launcher script
            # 'list_master_uri': None,  # run a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set to match the number of GPUs in use
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })

    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)
    num_iteration = 10000

    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
    for name in [
            name for name in os.listdir(CHECKPOINT_PATH_BASE)
            if 'checkpoint_' in name
    ]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Running using (', checkpoint_name, ').')
        trainer.restore(checkpoint_path + checkpoint_name)

    print(checkpoint_name, '==========================================')

    ## goal/collision data init
    success_cnt = 0

    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))

    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        ## goal/collision data create
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass

        result = trainer.train()
        print(pretty_print(result))

        # save a recovery checkpoint every 5 iterations
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)

        # save a checkpoint for result inspection every 100 iterations
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        ## goal/collision data read
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
            goal_list = episodes_raw.split(',')
            goal_cnt = goal_list.count('1')
            if goal_cnt == 0:
                goal_ratio = 0
            else:
                goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
            print('goal rate:', goal_ratio)
            with open(goal_rate_filename, 'a') as goal_rate_logfile:
                goal_rate_logfile.write(
                    str(result['training_iteration']) + ',' + str(goal_ratio) +
                    '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('success in raw:', success_cnt)
        else:
            success_cnt = 0

        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
                break

        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')
def get_trainer_from_params(params):
    return PPOTrainer(env=params['env'], config=params['rllib_params'])
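
A hedged sketch of the params structure this one-liner expects follows; the env id and rllib_params contents are placeholders, not values from the source project.

params = {
    "env": "CartPole-v0",   # placeholder: a Gym id or an env registered via register_env
    "rllib_params": {"num_workers": 2, "train_batch_size": 4000},
}
trainer = get_trainer_from_params(params)
print(trainer.train()["episode_reward_mean"])
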
Example #17
 ppo_trainer = PPOTrainer(
     env="gfootball",
     config={
         "framework": "torch",
         # Should use a critic as a baseline (otherwise don't use value baseline;
         # required for using GAE).
         "use_critic": True,
         # If true, use the Generalized Advantage Estimator (GAE)
         # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
         "use_gae": True,
         'simple_optimizer': args.simple,
         'observation_filter': 'NoFilter',
         "num_envs_per_worker": 1,
         "num_gpus": 1,
         "ignore_worker_failures": True,
         "train_batch_size": 4000,
         'rollout_fragment_length': 512,
         "sgd_minibatch_size": 500,
         "lr": 3e-4,
         "lambda": .95,
         "gamma": .998,
         "entropy_coeff": 1e-4,
         "kl_coeff": 1.0,
         "clip_param": 0.2,
         "num_sgd_iter": 10,
         "vf_share_layers": True,  #?? True?
         "vf_clip_param": 100.0,
         "model": {
             "custom_model": "my_model"
         },
         "multiagent": {
             "policies": {
                 "policy_01": (None, obs_space, act_space, {}),
                 "policy_02": (None, obs_space, act_space, {}),
                 "policy_03": (None, obs_space, act_space, {}),
                 "policy_04": (None, obs_space, act_space, {})
             },
             "policy_mapping_fn": tune.function(policy_mapping_fn),
             "policies_to_train": ["policy_01"]
         },
         "evaluation_interval": 50,
         "evaluation_config": {
             "env_config": {
                 # Use test set to evaluate
                 'mode': 'test'
             },
             "explore": False,
             "multiagent": {
                 "policies": {
                     # the first tuple value is None -> uses default policy class
                     "policy_01": (None, obs_space, act_space, {}),
                     "policy_02": (None, obs_space, act_space, {}),
                     "policy_03": (None, obs_space, act_space, {}),
                     "policy_04": (None, obs_space, act_space, {})
                 },
                 "policy_mapping_fn": policy_mapping_fn_eval
             },
         },
     })
  
  # Proximal Policy Optimization (PPO)
  print("Training algorithm: Proximal Policy Optimization (PPO)")
  
  trainer = PPOTrainer(
              env=env_title,
              config={
                "num_workers": num_workers,
                "num_cpus_per_worker": num_cpus_per_worker,
                "num_gpus": num_gpus,
                "num_gpus_per_worker": num_gpus_per_worker,
                "model": nw_model,
                "lr": lr,
                "gamma": gamma,
                "lambda": lambda_trainer,
                "multiagent": {
                  "policy_graphs": policy_graphs,
                  "policy_mapping_fn": policy_mapping_fn,
                  "policies_to_train": ["agent_policy{}".format(i) for i in range(n_agents)],
                },
                "callbacks": {
                  "on_episode_start": tune.function(on_episode_start),
                  "on_episode_step": tune.function(on_episode_step),
                  "on_episode_end": tune.function(on_episode_end),
                },
                "log_level": "ERROR",
              })
              
elif(train_algo == "A2C"):
  
  # A2C (the body of this branch was cut off in the source)
    def my_train_fn(config, reporter):
        active_policy = None
        threshold = 0.7
        trainer_updates = []

        # ppo_trainer = MyPPOTrainer(env='c4', config=config)
        ppo_trainer = PPOTrainer(env='c4', config=config)
        bandit = Exp3Bandit(len(trainable_policies))

        def func(worker):
            worker.sampler.policy_mapping_fn = learned_vs_random_mapping_fn
            foo = 1

        # ppo_trainer.workers.foreach_worker(lambda w: w.sampler.policy_mapping_fn)
        ppo_trainer.workers.foreach_worker(func)

        # trainable_policies = ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0][:]
        # trainable_policies = ppo_trainer.workers.foreach_worker(
        #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
        # )

        while True:
            result = ppo_trainer.train()
            reporter(**result)

            foo = 1

            timestep = result['timesteps_total']
            training_iteration = result['training_iteration']
            # print('\n')
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('timestep: {:,}'.format(timestep))
            # print('trainable_policies: %s' % trainable_policies)
            # if active_policy is None and timestep > int(5e6):
            # # if active_policy is None and timestep > int(25e4):
            #     active_policy = trainable_policies[0]
            #     # ppo_trainer.workers.foreach_worker(
            #     #     lambda w: w.foreach_trainable_policy(lambda p, i: (i, p))
            #     # )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[0] \
            #         and result['policy_reward_mean'][trainable_policies[0]] > threshold:
            #     active_policy = trainable_policies[1]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[0])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[1])
            #     )
            #     trainer_updates.append(timestep)
            # elif active_policy == trainable_policies[1] \
            #         and result['policy_reward_mean'][trainable_policies[1]] > threshold:
            #     active_policy = trainable_policies[0]
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.remove(trainable_policies[1])
            #     )
            #     ppo_trainer.workers.foreach_worker(
            #         lambda w: w.policies_to_train.append(trainable_policies[0])
            #     )
            #     trainer_updates.append(timestep)
            #
            # print('active_policy: %s' % active_policy)
            # print('worker TPs: %s' % ppo_trainer.workers.foreach_worker(lambda w: w.policies_to_train)[0])
            # print('trainer updates: %s' % '\n  - ' + '\n  - '.join('{:,}'.format(tu) for tu in trainer_updates))
            # print('$$$$$$$$$$$$$$$$$$$$$$$')
            # print('\n')

            # if timestep > int(1e6):
            # if training_iteration >= 20:
            #     break

            # if result['episode_reward_mean'] > 200:
            #     phase = 2
            # elif result['episode_reward_mean'] > 100:
            #     phase = 1
            # else:
            #     phase = 0
            # ppo_trainer.workers.foreach_worker(
            #     lambda ev: ev.foreach_env(
            #         lambda env: env.set_phase(phase)))

        state = ppo_trainer.save()
        ppo_trainer.stop()
        "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    ppo_trainer = PPOTrainer(
        env="multi_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "explore": False,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
        })

    dqn_trainer = DQNTrainer(env="multi_cartpole",
                             config={
                                 "multiagent": {
                                     "policies": policies,
                                     "policy_mapping_fn": policy_mapping_fn,
                                     "policies_to_train": ["dqn_policy"],
                                 },
                                 "gamma": 0.95,
 def _train(self):
     ppo_trainer = PPOTrainer(env='c4', config=self.config)
     while True:
         result = ppo_trainer.train()
         # reporter(**result)
         print('ran iteration')
Example #22
ray.init()
results = {}
N = 100
config["num_workers"] = 1
config["num_gpus"] = 0

# You may have to run each agent in a separate session
# to avoid PyBullet restrictions
agent = "ALP-GMM"
# agent = "Manual"
# agent = "No Curriculum"

print(f"Evaluating agent: {agent}")
results[agent] = []
trainer = PPOTrainer(config=config, env=envs[agent])
trainer.restore(agents[agent])
env = envs[agent](dict(config["env_config"], **{"in_training": False}))
for i in range(N):
    print(agent, i)
    done = False
    obs = env.reset()
    ep_reward = 0
    while not done:
        action = trainer.compute_action(obs)
        obs, reward, done, info = env.step(action)
        ep_reward += reward
        if done:
            obs = env.reset()
            results[agent].append(ep_reward)
print(f"Agent {agent} score: {np.round(np.mean(results[agent]), 2)}")

def policy_mapping_fn(agent_id):
    if agent_id[0] % 2 == 0:
        return "ppo_policy_atk"
    return "ppo_policy_def"


pdb.set_trace()

ppo_trainer = PPOTrainer(env="haxball_env",
                         config={
                             "multiagent": {
                                 "policies":
                                 policies,
                                 "policy_mapping_fn":
                                 policy_mapping_fn,
                                 "policies_to_train":
                                 ["ppo_policy_atk", "ppo_policy_def"],
                             },
                             "timesteps_per_iteration": 7550,
                         })

for i in range(20):
    print("== Iteration", i, "==")

    # improve the PPO policy
    print("-- PPO --")
    print(pretty_print(ppo_trainer.train()))

# One policy for the player who takes the kickoff, and one for the player who receives it