Example 1
def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model",
                                       MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf"
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {}==".format(i))
        results = trainer.train()
        pretty_print(results)
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
 def test_model(self) -> Tuple[List[float], list]:
     ray.init(logging_level=logging.INFO, ignore_reinit_error=True)
     agent = DQNTrainer(self.config, env=custom_env_name)
     weights = torch.load(
         self.params.model_dir / "trained_model.pt",
         map_location=lambda storage, loc: storage,
     )
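      # Note: loading a PyTorch state dict this way assumes the trainer was
      # built with "framework": "torch" in self.config, so the weight layout
      # matches the default policy's model.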
     agent.set_weights({"default_policy": weights})
     rewards = []
     longest_screens = []
     for i in range(self.params.num_testing_episodes):
         screens = []
         try:
             logger.info("Iteration: {}", i)
             state = self.env.reset()
             done = False
             cumulative_reward = 0
             while not done:
                 action = agent.compute_action(state)
                 state, reward, done, _ = self.env.step(action)
                 screen = self.env.render(mode="rgb_array")
                 screens.append(screen)
                 cumulative_reward += reward
                 time.sleep(0.01)
             logger.info("Iteration: {}, Reward: {}", i, cumulative_reward)
             rewards.append(cumulative_reward)
         except KeyboardInterrupt:
             logger.info("Testing was interrupted")
             break
         if len(screens) > len(longest_screens):
             longest_screens = screens
     self.env.close()
     ray.shutdown()
     return rewards, longest_screens
def dqn_train(config, reporter):
    # Instantiate a trainer
    cfg = {
        # Max num timesteps for annealing schedules. Exploration is annealed from
        # 1.0 to exploration_final_eps over this number of timesteps, scaled by
        # exploration_fraction.
        "schedule_max_timesteps": 1000000,
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": 1000,
        # Fraction of entire training period over which the exploration rate is
        # annealed
        "exploration_fraction": 0.1,
        # Final value of random action probability
        "exploration_final_eps": 0.02,
        "n_step": 3,
        "buffer_size": 500000,
        # "sample_batch_size"         : 32,
        # "train_batch_size"          : 128,
        # "learning_starts"           : 5000,
        # "target_network_update_freq": 5000,
        # "num_workers"               : NUM_WORKERS,
        # "per_worker_exploration"    : True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s"           : 1,
    }
    trainer = DQNTrainer(config={**config, **cfg})

    while True:
        result = trainer.train()  # Executes one training step
        print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner
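The function above follows the legacy Tune function API, in which a (config, reporter) trainable is handed to Tune directly. A minimal launch sketch under that assumption (the env name and stopping criterion are placeholders):

from ray import tune

# Hypothetical launch of the custom training function defined above.
tune.run(
    dqn_train,
    config={"env": "CartPole-v0"},
    stop={"training_iteration": 100},
)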
Example 4
    def test_reproducing_trajectory(self):
        class PickLargest(gym.Env):
            def __init__(self):
                self.observation_space = gym.spaces.Box(low=float("-inf"),
                                                        high=float("inf"),
                                                        shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        for fw in framework_iterator(frameworks=("tf", "torch")):
            trajs = list()
            for trial in range(3):
                ray.init()
                register_env("PickLargest", env_creator)
                config = {
                    "seed": 666 if trial in [0, 1] else 999,
                    "min_time_s_per_reporting": 0,
                    "timesteps_per_iteration": 100,
                    "framework": fw,
                }
                agent = DQNTrainer(config=config, env="PickLargest")

                trajectory = list()
                for _ in range(8):
                    r = agent.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)

                ray.shutdown()

            # trial0 and trial1 use same seed and thus
            # expect identical trajectories.
            all_same = True
            for v0, v1 in zip(trajs[0], trajs[1]):
                if v0 != v1:
                    all_same = False
            self.assertTrue(all_same)

            # trial1 and trial2 use different seeds and thus
            # most rewards tend to be different.
            diff_cnt = 0
            for v1, v2 in zip(trajs[1], trajs[2]):
                if v1 != v2:
                    diff_cnt += 1
            self.assertTrue(diff_cnt > 8)
Example 5
 def testTrainCartpoleOffPolicy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     dqn = DQNTrainer(env="test3", config={"exploration_fraction": 0.001})
     for i in range(100):
         result = dqn.train()
         print("Iteration {}, reward {}, timesteps {}".format(
             i, result["episode_reward_mean"], result["timesteps_total"]))
         if result["episode_reward_mean"] >= 100:
             return
     raise Exception("failed to improve reward")
Example 6
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'
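    # NOTE: custom preprocessors are deprecated in newer RLlib releases; recent
    # versions recommend wrapping the environment or using a custom model instead.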

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    trainer = DQNTrainer(config, "tron_single_player")
    #trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./dqn_checkpoint_3800/checkpoint-3800")

    return trainer  #.get_policy("trainer")
Example 7
 def testEvaluationOption(self):
     ray.init()
     agent = DQNTrainer(env="CartPole-v0",
                        config={"evaluation_interval": 2})
     r0 = agent.train()
     r1 = agent.train()
     r2 = agent.train()
     r3 = agent.train()
     r4 = agent.train()
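     # With evaluation_interval=2, a fresh evaluation is only run every other
     # train() call; in between, the previous evaluation metrics are repeated,
     # which is what the alternating (not-)equal assertions below verify.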
     self.assertTrue("evaluation" in r0)
     self.assertTrue("episode_reward_mean" in r0["evaluation"])
     self.assertEqual(r0["evaluation"], r1["evaluation"])
     self.assertNotEqual(r1["evaluation"], r2["evaluation"])
     self.assertEqual(r2["evaluation"], r3["evaluation"])
     self.assertNotEqual(r3["evaluation"], r4["evaluation"])
def save_best_response_checkpoint(trainer: DQNTrainer,
                                  player: int,
                                  save_dir: str,
                                  timesteps_training_br: int,
                                  episodes_training_br: int,
                                  active_policy_num: int = None):
    policy_name = active_policy_num if active_policy_num is not None else "unclaimed"
    date_time = datetime_str()
    checkpoint_name = f"policy_{policy_name}_{date_time}.h5"
    checkpoint_path = os.path.join(save_dir, checkpoint_name)
    br_weights = trainer.get_weights(["best_response"])["best_response"]
    br_weights = {k.replace(".", "_dot_"): v
                  for k, v in br_weights.items()
                  }  # periods cause HDF5 NaturalNaming warnings
    ensure_dir(file_path=checkpoint_path)
    num_save_attempts = 5
    for attempt in range(num_save_attempts):
        try:
            deepdish.io.save(path=checkpoint_path,
                             data={
                                 "weights": br_weights,
                                 "player": player,
                                 "policy_num": active_policy_num,
                                 "date_time_str": date_time,
                                 "seconds_since_epoch": time.time(),
                                 "timesteps_training_br":
                                 timesteps_training_br,
                                 "episodes_training_br": episodes_training_br
                             })
            break
        except HDF5ExtError:
            if attempt + 1 == num_save_attempts:
                raise
            time.sleep(1.0)
    return checkpoint_path
Example 9
def dqn_trainer_wt_amtft_policies_in_ipd(
    logger_creator,
):
    train_n_replicates = 1
    debug = True
    hparams = get_hyperparameters(
        debug,
        train_n_replicates,
        filter_utilitarian=False,
        env="IteratedPrisonersDilemma",
    )

    _, _, rllib_config = get_rllib_config(
        hparams, welfare_fn=postprocessing.WELFARE_UTILITARIAN
    )

    rllib_config["env"] = IteratedPrisonersDilemma
    rllib_config["seed"] = int(time.time())

    policies = rllib_config["multiagent"]["policies"]
    for policy_id, policy_tuple in policies.items():
        policy_list = list(policy_tuple)
        policy_list[0] = amTFT.AmTFTRolloutsTorchPolicy
        policies[policy_id] = policy_list

    dqn_trainer = DQNTrainer(rllib_config, logger_creator=logger_creator)
    return dqn_trainer
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
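    # NOTE: the "image-dqn" custom model referenced in the config below is
    # assumed to be registered with ModelCatalog elsewhere in the project.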
    trainer = DQNTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
Example 11
    def train(config, checkpoint_dir=None):
        trainer = DQNTrainer(config=config, env='BomberMan-v0')
        # trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
        iter = 0

        # def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example 12
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs':
        spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask':
        spaces.Box(low=0, high=1, shape=(5, ), dtype=np.int32)
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
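                # NOTE: 'policy_graphs' and the function() wrapper reflect an
                # older RLlib/Tune API; newer releases use 'policies' and accept
                # plain callables for 'policy_mapping_fn'.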
                'policy_graphs': {
                    'default_policy_graph': (
                        DQNPolicyGraph,
                        obs_space,
                        act_space,
                        {},
                    ),
                },
                'policy_mapping_fn':
                function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
Example 13
 def test_policy_save_restore(self):
     config = DEFAULT_CONFIG.copy()
     for _ in framework_iterator(config):
         trainer = DQNTrainer(config=config, env="CartPole-v0")
         policy = trainer.get_policy()
         state1 = policy.get_state()
         trainer.train()
         state2 = policy.get_state()
         check(state1["_exploration_state"]["last_timestep"],
               state2["_exploration_state"]["last_timestep"],
               false=True)
         check(state1["global_timestep"],
               state2["global_timestep"],
               false=True)
         # Reset policy to its original state and compare.
         policy.set_state(state1)
         state3 = policy.get_state()
         # Make sure everything is the same.
         check(state1, state3)
def train_model(args):
    # We are using a custom model and environment, which need to be registered in ray/rllib.
    # Names can be anything.
    register_env("DuckieTown-MultiMap",
                 lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. env, config/framework, and config/model are common among trainers.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Doing this allows us to record images from the DuckieTown Gym! Might be useful for report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs one training iteration (not necessarily a single episode)
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
Example 15
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))
    ray.shutdown()
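    # NOTE: the checkpoint below is saved after ray.shutdown(); depending on the
    # Ray/RLlib version, saving may need to happen before shutting Ray down.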

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
 def test_train_cartpole_off_policy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     config = {
         "num_workers": 0,
         "exploration_config": {
             "epsilon_timesteps": 100
         },
     }
     for _ in framework_iterator(config, frameworks=("tf", "torch")):
         dqn = DQNTrainer(env="test3", config=config)
         reached = False
         for i in range(50):
             result = dqn.train()
             print("Iteration {}, reward {}, timesteps {}".format(
                 i, result["episode_reward_mean"],
                 result["timesteps_total"]))
             if result["episode_reward_mean"] >= 80:
                 reached = True
                 break
         if not reached:
             raise Exception("failed to improve reward")
Example 17
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json', help='config file')
    parser.add_argument('--algo', type=str, default='DQN', choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true", help='inference or training')
    parser.add_argument('--ckpt', type=str, help='path to checkpoint (for inference)')
    parser.add_argument('--epoch', type=int, default=100, help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100, help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128, help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5, help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30, help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)
    
    trainer = DQNTrainer(
        env=CityflowGymEnv,
        config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if (i+1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example 18
class DQNrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DQNTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
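                # save_to_object() returns an in-memory checkpoint blob; here it
                # is persisted manually with pickle and later handed back to
                # restore_from_object() in predict().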
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
def train_model(args, config):
    # Define the trainer. env, config/framework, and config/model are common among trainers.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs one training iteration (not necessarily a single episode)
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
Example 20
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "explore": False,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
            "framework": "torch" if args.torch else "tf",
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.95,
            "n_step": 3,
            "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
Example 21
        # Use a single worker process to run the server.
        "num_workers":
        0,
        # Disable OPE, since the rollouts are coming from online clients.
        "input_evaluation": [],
    }

    if args.run == "DQN":
        # Example of using DQN (supports off-policy actions).
        trainer = DQNTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "exploration_config": {
                                         "type": "EpsilonGreedy",
                                         "initial_epsilon": 1.0,
                                         "final_epsilon": 0.02,
                                         "epsilon_timesteps": 1000,
                                     },
                                     "learning_starts": 100,
                                     "timesteps_per_iteration": 200,
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
Example 22
def train(config, reporter):
    trainer = DQNTrainer(config=config, env=Coach)
    for _ in range(11):
        print(_)
        trainer.train()
Example 23
_____________________________________________________________________
value_out (Dense)          (None, 1)     257      fc_value_2[0][0]
=====================================================================
Total params: 134,915
Trainable params: 134,915
Non-trainable params: 0
_____________________________________________________________________
"""
# __query_action_dist_end__

# __get_q_values_dqn_start__
# Get a reference to the model through the policy
import numpy as np
from ray.rllib.agents.dqn import DQNTrainer

trainer = DQNTrainer(env="CartPole-v0", config={"framework": "tf2"})
model = trainer.get_policy().model
# <ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQModel ...>

# List of all model variables
model.variables()

# Run a forward pass to get base model output. Note that complex observations
# must be preprocessed. An example of preprocessing is examples/saving_experiences.py
model_out = model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=832, shape=(1, 256), dtype=float32, numpy=...)

# Access the base Keras models (all default models have a base)
model.base_model.summary()
"""
Model: "model"
Example 24
            # as well to the DQN agent
            "observation_filter": "MeanStdFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "model": {
                "vf_share_layers": True,
            },
            "gamma": 0.95,
            "n_step": 3,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
Example 25
def main():

    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='path to checkpoint (for inference)')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    ### dw ###
    #parser.add_argument("--num-agents", type=int, default=6)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)

    num_agents = len(config_env["intersection_id"])
    '''
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    '''

    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    #config_agent = agent_config(config_env)

    # # build cityflow environment
    '''
    trainer = DQNTrainer(
        env=CityFlowEnvRay,
        config=config_agent)
    '''

    policies = {
        #"dqn_policy":(None, obs_space, act_space, config_env)
        #"policy_{}".format(i): (None, obs_space, act_space, config_env)
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)

    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
        print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
Example 27
    def run_dqn(self, config):
        # RAY tmp
        temp_dir_full_path_obj = Path(self.ray_temp_dir).resolve()
        temp_dir_full_path_obj.mkdir(parents=True, exist_ok=True)
        temp_dir_full_path = str(temp_dir_full_path_obj)
        # Result paths
        result_dir_path_root = Path(self.run_result_dir).resolve()
        # Separate MDDE output and Ray output
        result_dir_path_ray_obj = result_dir_path_root.joinpath("ray")
        result_dir_path_ray_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_ray = str(result_dir_path_ray_obj)
        result_dir_path_mdde_obj = result_dir_path_root.joinpath("mdde")
        result_dir_path_mdde_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_mdde = str(result_dir_path_mdde_obj)
        # Config
        config_file_full_path = str(Path(self.mdde_registry_config).resolve())
        # MDDE tmp
        temp_env_dir = self.env_temp_dir

        os.makedirs(os.path.abspath(temp_env_dir), exist_ok=True)

        ray.init(
            num_gpus=0,
            num_cpus=4,
            #temp_dir=temp_dir_full_path
        )

        mdde_config = ConfigEnvironment(tmp_dir=temp_env_dir,
                                        result_dir=result_dir_path_mdde)

        def make_env(host: str,
                     port: int,
                     reg_config: str,
                     env_config: ConfigEnvironment,
                     write_stats: bool,
                     initial_benchmark: bool = False,
                     do_nothing: bool = True) -> Environment:
            """
            Configure MDDE environment to run default.
            :param host: MDDE registry host or IP.
            :param port: MDDE registry control port.
            :param reg_config: Path to MDDE registry config.
            :param env_config: Environment configuration object.
            :param write_stats: True to write additional analytics info.
            :param initial_benchmark: Execute benchmark immediately upon execution.
            :param do_nothing: Enable or disable the agents' "do_nothing" action.
            :return: MDDE Environment.
            """

            # Ray is peculiar in the way it handles environments, passing a pre-configured environment might cause
            # unexpected behavior. Customize the code of this extension if more complex environments are needed

            # Create Registry client
            tcp_client = RegistryClientTCP(host, port)
            read_client: PRegistryReadClient = tcp_client
            write_client: PRegistryWriteClient = tcp_client
            ctrl_client: PRegistryControlClient = tcp_client

            # Registry configuration
            config_container = ConfigRegistry()
            config_container.read(reg_config)

            # Create agents
            agents = list()
            idx = 0
            for node in config_container.get_nodes():
                agents.append(
                    SingleNodeDefaultAgent(agent_name=node.id,
                                           agent_id=idx,
                                           data_node_id=node.id,
                                           write_stats=write_stats,
                                           allow_do_nothing=do_nothing))
                idx += 1

            # Create scenario
            scenario = DefaultScenario(
                num_fragments=20,
                num_steps_before_bench=config.bench_psteps,
                agents=agents,
                benchmark_clients=config.bench_clients,
                write_stats=write_stats)  # Number of YCSB threads

            # Create environment
            environment = Environment(config=env_config,
                                      scenario=scenario,
                                      registry_ctrl=ctrl_client,
                                      registry_write=write_client,
                                      registry_read=read_client,
                                      write_stats=write_stats)
            # Re-generate data
            environment.initialize_registry(with_benchmark=initial_benchmark)

            return environment

        def obs_shaper_2d_box(obs):
            """Reshapes the environment into a form suitable for 2D box. Example 1.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # a_1: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # a_2: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # Hint: 2D array where rows are agents, and attributes in columns are as shown above.
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]),
                               order='F')

        def obs_shaper_flat_box(obs):
            """Reshapes the environment into a form suitable for 2D 'flat' box. Example 2.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # [0-4(a_1: allocation) 5-9(a_1: popularity) 10-14(a_1: ownership binary flag)
            #  15-19(a_2: allocation) 20-24(a_2: popularity) 25-29(a_2: ownership binary flag)]
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F') \
                .reshape((obs.shape[0] * obs.shape[1] * obs.shape[2]), order='C')

        sample_selected_shaper = obs_shaper_flat_box
        """Observation shaper selected. Set None if you want to use the default one in the wrapper."""

        # Create and initialize the environment before passing it to Ray.
        # This makes it impossible to run multiple instances of the environment, but that is intentional:
        # the environment represents a distributed infrastructure of services, so it can't be easily
        # created and destroyed like a simple local game-style environment.
        env_instance = MddeMultiAgentEnv(
            env=make_env(host=self.mdde_registry_host,
                         port=self.mdde_registry_port,
                         reg_config=config_file_full_path,
                         env_config=mdde_config,
                         write_stats=False,
                         initial_benchmark=False,
                         do_nothing=config.do_nothing),
            observation_shaper=sample_selected_shaper)

        def env_creator(kvargs):
            env = make_env(**kvargs)
            return MddeMultiAgentEnv(env=env,
                                     observation_shaper=sample_selected_shaper)

        register_env("mdde", env_creator)

        # generate policies based on the created environment instance
        def gen_policy(i):
            return (None, env_instance.observation_space_dict[i],
                    env_instance.action_space_dict[i], {
                        "agent_id": i,
                        "obs_space_dict":
                        env_instance.observation_space_dict[i],
                        "act_space_dict": env_instance.action_space_dict[i],
                    })

        policies = {
            "policy_%d" % i: gen_policy(i)
            for i in env_instance.action_space_dict.keys()
        }
        policy_ids = list(policies.keys())

        def policy_mapping_fn(agent_id):
            return policy_ids[agent_id]
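            # Assumes agent ids are consecutive integers starting at 0, so they
            # can index directly into policy_ids.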

        exp_name = "DQN_MDDE_DEBUG"
        exp_config = {
            # === Log ===
            "log_level": "ERROR",

            # === Environment ===
            "env_config": {
                "host": self.mdde_registry_host,
                "port": self.mdde_registry_port,
                "reg_config": config_file_full_path,
                "env_config": mdde_config,
                "write_stats": True,
                "do_nothing": config.do_nothing
            },
            "num_envs_per_worker": 1,
            "horizon": config.ep_len,

            # === Policy Config ===
            # --- Model ---
            "n_step": 1,
            #"gamma": config.gamma,

            # --- Replay buffer ---
            "buffer_size": config.buffer_size,

            # --- Optimization ---
            "lr": config.lr,
            "learning_starts": config.learning_starts,
            "train_batch_size": self.TRAIN_BATCH_SIZE,
            "batch_mode": "truncate_episodes",

            # --- Parallelism ---
            "num_workers": 0,
            "num_gpus": 0,
            "num_gpus_per_worker": 0,

            # === Multi-agent setting ===
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": ray.tune.function(policy_mapping_fn)
            },
        }

        if config.debug:  # Run DQN within the same process (useful for debugging)
            dqn_trainer = DQNTrainer(env="mdde", config=exp_config)
            for step in range(0, config.num_episodes * config.ep_len):
                dqn_trainer.train()
        else:
            trainer = DQNTrainer
            run_experiments(
                {
                    exp_name: {
                        "run": trainer,
                        "env": "mdde",
                        "stop": {
                            "episodes_total": config.num_episodes,
                        },
                        "checkpoint_freq": 0,
                        "local_dir": result_dir_path_ray,
                        "restore": False,
                        "config": exp_config
                    },
                },
                verbose=0,
                reuse_actors=False
            )  # reuse_actors=True - messes up the results
Example 28
        # Use the connector server to generate experiences.
        "input": (
            lambda ioctx: PolicyServerInput(ioctx, SERVER_ADDRESS, SERVER_PORT)
        ),
        # Use a single worker process to run the server.
        "num_workers": 0,
        # Disable OPE, since the rollouts are coming from online clients.
        "input_evaluation": [],
    }

    if args.run == "DQN":
        # Example of using DQN (supports off-policy actions).
        trainer = DQNTrainer(
            env=env,
            config=dict(
                connector_config, **{
                    "learning_starts": 100,
                    "timesteps_per_iteration": 200,
                    "framework": args.framework,
                }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(
            env=env,
            config=dict(
                connector_config, **{
                    "rollout_fragment_length": 1000,
                    "train_batch_size": 4000,
                    "framework": args.framework,
                }))
    else:
        raise ValueError("--run must be DQN or PPO")
Example 29
    policy.q_loss.stats.update({"q_loss": policy.q_loss.loss})

    loss = policy.q_model.extra_loss(policy.q_loss.loss, train_batch,
                                     policy.q_loss.stats)

    return loss


def _compute_q_values(policy, model, obs, obs_space, action_space):
    model({
        "obs": obs,
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    q_out = model.get_q_out()

    return q_out["value"], q_out["logits"], q_out["dist"]


LegalActionDQNPolicy = DQNTFPolicy.with_updates(
    name="LegalActionDQNPolicy",
    action_sampler_fn=build_q_networks,
    loss_fn=build_q_losses)

LegalActionDQNTrainer = DQNTrainer.with_updates(
    name="LegalActionDQN", default_policy=LegalActionDQNPolicy)

LegalActionApexTrainer = LegalActionDQNTrainer.with_updates(
    name="LegalActionAPEX",
    default_config=APEX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
            "explore": False,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch else "tf",
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.95,
            "n_step": 3,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")