Example No. 1
 def testEvaluationOption(self):
     ray.init()
     agent = DQNTrainer(env="CartPole-v0",
                        config={"evaluation_interval": 2})
     r0 = agent.train()
     r1 = agent.train()
     r2 = agent.train()
     r3 = agent.train()
     r4 = agent.train()
     self.assertTrue("evaluation" in r0)
     self.assertTrue("episode_reward_mean" in r0["evaluation"])
     self.assertEqual(r0["evaluation"], r1["evaluation"])
     self.assertNotEqual(r1["evaluation"], r2["evaluation"])
     self.assertEqual(r2["evaluation"], r3["evaluation"])
     self.assertNotEqual(r3["evaluation"], r4["evaluation"])
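Note on Example No. 1: with evaluation_interval=2 the evaluation metrics are only recomputed every second train() call and carried over in between, which is exactly what the equality/inequality assertions above check. A minimal sketch of enabling periodic evaluation outside a test (the episode count is an illustrative choice, not taken from the original):

import ray
from ray.rllib.agents.dqn import DQNTrainer

ray.init()
trainer = DQNTrainer(
    env="CartPole-v0",
    config={
        "evaluation_interval": 2,      # run evaluation every 2nd train() call
        "evaluation_num_episodes": 5,  # episodes per evaluation round (illustrative)
    })
result = trainer.train()
print(result["evaluation"]["episode_reward_mean"])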
Example No. 2
def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model",
                                       MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf"
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {}==".format(i))
        results = trainer.train()
        pretty_print(results)
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
Example No. 3
def dqn_train(config, reporter):
    # Instantiate a trainer
    cfg = {
        # Max num timesteps for annealing schedules. Exploration is annealed from
        # 1.0 to exploration_fraction over this number of timesteps scaled by
        # exploration_fraction
        "schedule_max_timesteps": 1000000,
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": 1000,
        # Fraction of entire training period over which the exploration rate is
        # annealed
        "exploration_fraction": 0.1,
        # Final value of random action probability
        "exploration_final_eps": 0.02,
        "n_step": 3,
        "buffer_size": 500000,
        # "sample_batch_size"         : 32,
        # "train_batch_size"          : 128,
        # "learning_starts"           : 5000,
        # "target_network_update_freq": 5000,
        # "num_workers"               : NUM_WORKERS,
        # "per_worker_exploration"    : True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s"           : 1,
    }
    trainer = DQNTrainer(config={**config, **cfg})

    while True:
        result = trainer.train()  # Executes one training step
        print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner
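dqn_train above follows Tune's old function-trainable signature (config, reporter), so it would typically be launched through tune.run rather than called directly; a hedged sketch, with the env name and stopping criterion chosen for illustration:

import ray
from ray import tune

ray.init()
tune.run(
    dqn_train,                        # the trainable defined above
    config={"env": "CartPole-v0"},    # merged with `cfg` inside dqn_train
    stop={"training_iteration": 50},  # illustrative stopping criterion
)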
Example No. 4
def train_model(args):
    # We are using a custom model and environment, which need to be registered in ray/rllib.
    # Names can be anything.
    register_env("DuckieTown-MultiMap",
                 lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model
    # (which are common among trainers), here is a list of the default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically, there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Doing this allows us to record images from the DuckieTown Gym! Might be useful for the report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs one full training iteration (not a single episode)
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
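The config above points at a custom model registered under "image-dqn", but the registration is not part of the snippet. A minimal sketch of what that registration might look like, using a hypothetical ImageDQNModel as a stand-in (the real model is not shown):

import numpy as np
import torch.nn as nn
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

class ImageDQNModel(TorchModelV2, nn.Module):
    """Hypothetical stand-in for the image-dqn model referenced above."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        in_size = int(np.prod(obs_space.shape))
        self.net = nn.Sequential(nn.Flatten(), nn.Linear(in_size, 256),
                                 nn.ReLU(), nn.Linear(256, num_outputs))

    def forward(self, input_dict, state, seq_lens):
        return self.net(input_dict["obs"].float()), state

# Must run before DQNTrainer is created so that "image-dqn" resolves.
ModelCatalog.register_custom_model("image-dqn", ImageDQNModel)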
Example No. 5
    def test_reproducing_trajectory(self):
        class PickLargest(gym.Env):
            def __init__(self):
                self.observation_space = gym.spaces.Box(low=float("-inf"),
                                                        high=float("inf"),
                                                        shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        for fw in framework_iterator(frameworks=("tf", "torch")):
            trajs = list()
            for trial in range(3):
                ray.init()
                register_env("PickLargest", env_creator)
                config = {
                    "seed": 666 if trial in [0, 1] else 999,
                    "min_time_s_per_reporting": 0,
                    "timesteps_per_iteration": 100,
                    "framework": fw,
                }
                agent = DQNTrainer(config=config, env="PickLargest")

                trajectory = list()
                for _ in range(8):
                    r = agent.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)

                ray.shutdown()

            # trial0 and trial1 use same seed and thus
            # expect identical trajectories.
            all_same = True
            for v0, v1 in zip(trajs[0], trajs[1]):
                if v0 != v1:
                    all_same = False
            self.assertTrue(all_same)

            # trial1 and trial2 use different seeds and thus
            # most rewards tend to be different.
            diff_cnt = 0
            for v1, v2 in zip(trajs[1], trajs[2]):
                if v1 != v2:
                    diff_cnt += 1
            self.assertTrue(diff_cnt > 8)
Example No. 6
 def test_policy_save_restore(self):
     config = DEFAULT_CONFIG.copy()
     for _ in framework_iterator(config):
         trainer = DQNTrainer(config=config, env="CartPole-v0")
         policy = trainer.get_policy()
         state1 = policy.get_state()
         trainer.train()
         state2 = policy.get_state()
         check(state1["_exploration_state"]["last_timestep"],
               state2["_exploration_state"]["last_timestep"],
               false=True)
         check(state1["global_timestep"],
               state2["global_timestep"],
               false=True)
         # Reset policy to its original state and compare.
         policy.set_state(state1)
         state3 = policy.get_state()
         # Make sure everything is the same.
         check(state1, state3)
Example No. 7
 def testTrainCartpoleOffPolicy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     dqn = DQNTrainer(env="test3", config={"exploration_fraction": 0.001})
     for i in range(100):
         result = dqn.train()
         print("Iteration {}, reward {}, timesteps {}".format(
             i, result["episode_reward_mean"], result["timesteps_total"]))
         if result["episode_reward_mean"] >= 100:
             return
     raise Exception("failed to improve reward")
Example No. 8
    def train(config, checkpoint_dir=None):
        trainer = DQNTrainer(config=config, env='BomberMan-v0')
        # trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
        iter = 0

        # def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example No. 9
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs':
        spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask':
        spaces.Box(low=0, high=1, shape=(5, ), dtype=np.int32)
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
                'policy_graphs': {
                    'default_policy_graph': (
                        DQNPolicyGraph,
                        obs_space,
                        act_space,
                        {},
                    ),
                },
                'policy_mapping_fn':
                function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
Example No. 10
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework and config/model
    # (which are common among trainers), here is a list of the default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically, there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs one full training iteration (not a single episode)
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
Example No. 11
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
    ray.shutdown()
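Example No. 11 only shows the server side; the PolicyServerInput configured above expects an external client that drives the environment and queries the server for actions. A hedged sketch of such a client, with the env and the ADDRESS/PORT values assumed to match the server settings:

import gym
from ray.rllib.env.policy_client import PolicyClient

env = gym.make("Pong-v0")  # illustrative; must match what the server-side trainer expects
client = PolicyClient("http://{}:{}".format(ADDRESS, PORT),
                      inference_mode="remote")

obs = env.reset()
episode_id = client.start_episode(training_enabled=True)
done = False
while not done:
    action = client.get_action(episode_id, obs)
    obs, reward, done, info = env.step(action)
    client.log_returns(episode_id, reward)
client.end_episode(episode_id, obs)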
Example No. 12
 def test_train_cartpole_off_policy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     config = {
         "num_workers": 0,
         "exploration_config": {
             "epsilon_timesteps": 100
         },
     }
     for _ in framework_iterator(config, frameworks=("tf", "torch")):
         dqn = DQNTrainer(env="test3", config=config)
         reached = False
         for i in range(50):
             result = dqn.train()
             print("Iteration {}, reward {}, timesteps {}".format(
                 i, result["episode_reward_mean"],
                 result["timesteps_total"]))
             if result["episode_reward_mean"] >= 80:
                 reached = True
                 break
         if not reached:
             raise Exception("failed to improve reward")
Example No. 13
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json', help='config file')
    parser.add_argument('--algo', type=str, default='DQN', choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true", help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch', type=int, default=100, help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100, help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128, help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5, help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30, help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)
    
    trainer = DQNTrainer(
        env=CityflowGymEnv,
        config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if (i+1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example No. 14
class DQNrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DQNTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
Example No. 15
def train(config, reporter):
    trainer = DQNTrainer(config=config, env=Coach)
    for _ in range(11):
        print(_)
        trainer.train()
Example No. 16
    "buffer_size": 50000,
    "sample_batch_size": 4,
    "train_batch_size": 320,
    "schedule_max_timesteps": 2000000,
    "exploration_final_eps": 0.01,
    "exploration_fraction": 0.1,
    "model": {
        "dim": 64
    }
})


def env_creator(env_config):
    return PodWorldEnv(max_steps=100, reward_factor=1.0)


register_env("podworld_env", env_creator)
agent = DQNTrainer(config=config, env="podworld_env")
agent_save_path = None

for i in range(50):
    stats = agent.train()
    # print(pretty_print(stats))
    if i % 10 == 0 and i > 0:
        path = agent.save()
        if agent_save_path is None:
            agent_save_path = path
            print('Saved agent at', agent_save_path)
    logger.write((i, stats['episode_reward_min']))
    print('episode_reward_min', stats['episode_reward_min'])
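Once agent_save_path holds a checkpoint, the agent can be restored and rolled out as a quick sanity check; a minimal sketch reusing the env_creator and config defined above:

# Restore the saved checkpoint and roll out one greedy episode (sketch).
eval_agent = DQNTrainer(config=config, env="podworld_env")
eval_agent.restore(agent_save_path)

env = env_creator({})
obs = env.reset()
done, total_reward = False, 0.0
while not done:
    action = eval_agent.compute_action(obs, explore=False)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print('evaluation reward:', total_reward)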
Example No. 17
def main():

    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    ### dw ###
    #parser.add_argument("--num-agents", type=int, default=6)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)

    num_agents = len(config_env["intersection_id"])
    '''
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    '''

    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    #config_agent = agent_config(config_env)

    # # build cityflow environment
    '''
    trainer = DQNTrainer(
        env=CityFlowEnvRay,
        config=config_agent)
    '''

    policies = {
        #"dqn_policy":(None, obs_space, act_space, config_env)
        #"policy_{}".format(i): (None, obs_space, act_space, config_env)
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)

    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example No. 18
if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
Example No. 19
dqn_config = {
    "v_min": -1.0,
    "v_max": 5.0,
    "hiddens": [128],
    "exploration_config": {
        "epsilon_timesteps": 4000,
    },
    'lr': 5e-5,
    "num_atoms": 2,
    "learning_starts": 100,
    "timesteps_per_iteration": 1200
}

if __name__ == "__main__":
    ray.init()

    register_env("LAIMKTEngine",
                 lambda _: LAIMKTEngine(MKTWorld(env_config), episodes=10000))
    dqn = DQNTrainer(env="LAIMKTEngine", config=dqn_config)

    i = 1
    while True:
        result = dqn.train()
        print(
            "Iteration {}, Episodes {}, Mean Reward {}, Mean Length {}".format(
                i, result['episodes_this_iter'], result['episode_reward_mean'],
                result['episode_len_mean']))
        i += 1

    ray.shutdown()
Example No. 20
    # Manual training loop (no Ray tune).
    if args.no_tune:
        if args.run == "DQN":
            trainer = DQNTrainer(config=config)
        else:
            trainer = PPOTrainer(config=config)

        if checkpoint_path:
            print("Restoring from checkpoint path", checkpoint_path)
            trainer.restore(checkpoint_path)

        # Serving and training loop.
        ts = 0
        for _ in range(args.stop_iters):
            results = trainer.train()
            print(pretty_print(results))
            checkpoint = trainer.save()
            print("Last checkpoint", checkpoint)
            with open(checkpoint_path, "w") as f:
                f.write(checkpoint)
            if results["episode_reward_mean"] >= args.stop_reward or \
                    ts >= args.stop_timesteps:
                break
            ts += results["timesteps_total"]

    # Run with Tune for auto env and trainer creation and TensorBoard.
    else:
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
Example No. 21
from ray.rllib.agents.dqn import DQNTrainer, DQNTorchPolicy
from ray.rllib.agents.dqn.dqn_torch_model import DQNTorchModel

config = {
    'gamma': 0.9,
    'lr': 1e-2,
    'num_workers': 4,
    'train_batch_size': 1000,
    'model': {
        'fcnet_hiddens': [128, 128]
    }
}

# Build a DQN trainer class that uses the custom execution plan, then instantiate it.
CustomDQNTrainer = DQNTrainer.with_updates(execution_plan=execution_plan)
trainer = CustomDQNTrainer(env="LunarLander-v3", config=config)
# Once enough data has been collected, the model is updated and the results are returned.
results = trainer.train()

from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.dqn.simple_q_tf_policy import SimpleQTFPolicy
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_buffer import LocalReplayBuffer
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts
from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork
from ray.rllib.policy.policy import LEARNER_STATS_KEY, Policy
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator
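The trainer in Example No. 21 is built with a custom execution_plan that is never shown, which is presumably what the execution-related imports above are for. A sketch of a DQN-style plan assembled from those imports, loosely following the simple-Q execution plan shipped with Ray 1.x (treat it as an assumption, not the original author's code):

def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    # Single local replay buffer fed by the rollout workers.
    replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    # (1) Store rollouts in the buffer; (2) replay, train, and sync the target net.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=replay_buffer))
    replay_op = Replay(local_buffer=replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)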
Example No. 22
    'grayscale': False,
    'zero_mean': False,
    'custom_preprocessor': None,
    'custom_model': None,
    'custom_action_dist': None,
    'custom_options': {}
}

# episode_len_ls = []
# for i in range(11):
#     trainer = DQNTrainer(config=config, env=Coach)
#     episode_states, episode_actions = simulate_episode(trainer, np.array([0, 0 ,0]))
#     episode_len_ls.append(len(episode_actions))
#     print(Counter(episode_actions))
# print('mean len:', np.mean(episode_len_ls))

#### training #####
trainer = DQNTrainer(config=config, env=Coach)
for i in range(6666):
    print('train iteration', i)
    trainer.train()
###################

episode_len_ls = []
for i in range(33):
    episode_states, episode_actions = simulate_episode(trainer,
                                                       np.array([0, 0, 0]))
    episode_len_ls.append(len(episode_actions))
    print(Counter(episode_actions))
print('mean len:', np.mean(episode_len_ls))
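simulate_episode is used here and in the next example but never defined; one plausible reconstruction is sketched below. The Coach constructor arguments and the "start from a given observation" behaviour are assumptions about the original helper.

def simulate_episode(trainer, start_obs):
    # Hypothetical helper: roll out one greedy episode from a given observation.
    env = Coach(config.get("env_config", {}))  # constructor args assumed
    env.reset()
    obs = start_obs
    episode_states, episode_actions = [obs], []
    done = False
    while not done:
        action = trainer.compute_action(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        episode_states.append(obs)
        episode_actions.append(action)
    return episode_states, episode_actions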
Example No. 23
episode_len_ls = []
episode_actions_ls = []
for i in range(20):
    trainer = DQNTrainer(config=config, env=Coach)
    episode_states, episode_actions = simulate_episode(trainer, np.array([0]))
    episode_len_ls.append(len(episode_actions))
    episode_actions_ls += episode_actions
print(Counter(episode_actions_ls))
print('mean len:', np.mean(episode_len_ls))
print('median len:', np.median(episode_len_ls))


trainer = DQNTrainer(config=config, env=Coach)
for train_iter in range(22):
    print('train_iter', train_iter)
    result = trainer.train()
    # print(pretty_print(result))


# import ipdb; ipdb.set_trace()
episode_len_ls = []
episode_actions_ls = []
for i in range(66):
    episode_states, episode_actions = simulate_episode(trainer, np.array([0]))
    episode_len_ls.append(len(episode_actions))
    episode_actions_ls += episode_actions
print(Counter(episode_actions_ls))
print('mean len:', np.mean(episode_len_ls))
print('median len:', np.median(episode_len_ls))

Example No. 24
config = {
    "env": LOREnv1,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        "policies_to_train": ["learned"],
        "policies": {
            "LORHeuristic":
            (LORHeuristic, env.observation_space, env.action_space, {}),
            "learned": (None, env.observation_space, env.action_space, {
                "model": {
                    "use_lstm": True
                },
            }),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj = DQNTrainer(config=config)
env = trainer_obj.workers.local_worker().env
for _ in range(100):
    results = trainer_obj.train()
    #print(results)

    #if _ % 100 == 0:
    print(env.player1_score, env.player2_score)
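select_policy is referenced in the multiagent block above but not defined in the snippet; a plausible mapping function, with the agent ids assumed for illustration:

def select_policy(agent_id):
    # Train the "learned" policy for one player and keep the heuristic fixed
    # for the other (agent id naming is an assumption).
    return "learned" if agent_id == "player_1" else "LORHeuristic"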
Example No. 25
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
    else:
        raise ValueError("--run must be DQN or PPO")

    checkpoint_path = CHECKPOINT_FILE.format(args.run)

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_path):
        checkpoint_path = open(checkpoint_path).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint = trainer.save()
        print("Last checkpoint", checkpoint)
        with open(checkpoint_path, "w") as f:
            f.write(checkpoint)
Example No. 26
    def run_dqn(self, config):
        # RAY tmp
        temp_dir_full_path_obj = Path(self.ray_temp_dir).resolve()
        temp_dir_full_path_obj.mkdir(parents=True, exist_ok=True)
        temp_dir_full_path = str(temp_dir_full_path_obj)
        # Result paths
        result_dir_path_root = Path(self.run_result_dir).resolve()
        # Separate MDDE output and Ray output
        result_dir_path_ray_obj = result_dir_path_root.joinpath("ray")
        result_dir_path_ray_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_ray = str(result_dir_path_ray_obj)
        result_dir_path_mdde_obj = result_dir_path_root.joinpath("mdde")
        result_dir_path_mdde_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_mdde = str(result_dir_path_mdde_obj)
        # Config
        config_file_full_path = str(Path(self.mdde_registry_config).resolve())
        # MDDE tmp
        temp_env_dir = self.env_temp_dir

        os.makedirs(os.path.abspath(temp_env_dir), exist_ok=True)

        ray.init(
            num_gpus=0,
            num_cpus=4,
            #temp_dir=temp_dir_full_path
        )

        mdde_config = ConfigEnvironment(tmp_dir=temp_env_dir,
                                        result_dir=result_dir_path_mdde)

        def make_env(host: str,
                     port: int,
                     reg_config: str,
                     env_config: ConfigEnvironment,
                     write_stats: bool,
                     initial_benchmark: bool = False,
                     do_nothing: bool = True) -> Environment:
            """
            Configure MDDE environment to run default.
            :param host: MDDE registry host or IP.
            :param port: MDDE registry control port.
            :param reg_config: Path to MDDE registry config.
            :param env_config: Environment configuration object.
            :param write_stats: True to write additional analytics info.
            :param initial_benchmark: Execute benchmark immediately upon execution.
            :param do_nothing: Enable or disable the agents' "do_nothing" action.
            :return: MDDE Environment.
            """

            # Ray is peculiar in the way it handles environments; passing a pre-configured environment might cause
            # unexpected behavior. Customize the code of this extension if more complex environments are needed.

            # Create Registry client
            tcp_client = RegistryClientTCP(host, port)
            read_client: PRegistryReadClient = tcp_client
            write_client: PRegistryWriteClient = tcp_client
            ctrl_client: PRegistryControlClient = tcp_client

            # Registry configuration
            config_container = ConfigRegistry()
            config_container.read(reg_config)

            # Create agents
            agents = list()
            idx = 0
            for node in config_container.get_nodes():
                agents.append(
                    SingleNodeDefaultAgent(agent_name=node.id,
                                           agent_id=idx,
                                           data_node_id=node.id,
                                           write_stats=write_stats,
                                           allow_do_nothing=do_nothing))
                idx += 1

            # Create scenario
            scenario = DefaultScenario(
                num_fragments=20,
                num_steps_before_bench=config.bench_psteps,
                agents=agents,
                benchmark_clients=config.bench_clients,
                write_stats=write_stats)  # Number of YCSB threads

            # Create environment
            environment = Environment(config=env_config,
                                      scenario=scenario,
                                      registry_ctrl=ctrl_client,
                                      registry_write=write_client,
                                      registry_read=read_client,
                                      write_stats=write_stats)
            # Re-generate data
            environment.initialize_registry(with_benchmark=initial_benchmark)

            return environment

        def obs_shaper_2d_box(obs):
            """Reshapes the environment into a form suitable for 2D box. Example 1.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # a_1: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # a_2: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # Hint: 2D array where rows are agents, and attributes in columns are as shown above.
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]),
                               order='F')

        def obs_shaper_flat_box(obs):
            """Reshapes the environment into a form suitable for 2D 'flat' box. Example 2.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # [0-4(a_1: allocation) 5-9(a_1: popularity) 10-14(a_1: ownership binary flag)
            #  15-19(a_2: allocation) 20-24(a_2: popularity) 25-29(a_2: ownership binary flag)]
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F') \
                .reshape((obs.shape[0] * obs.shape[1] * obs.shape[2]), order='C')

        sample_selected_shaper = obs_shaper_flat_box
        """Observation shaper selected. Set None if you want to use the default one in the wrapper."""

        # Create and initialize environment before passing it to Ray
        # This makes it impossible to run multiple instances of the environment, but that is intentional: the
        # environment represents a distributed infrastructure of services, so it can't be created and destroyed
        # as easily as a simple local game-like environment.
        env_instance = MddeMultiAgentEnv(
            env=make_env(host=self.mdde_registry_host,
                         port=self.mdde_registry_port,
                         reg_config=config_file_full_path,
                         env_config=mdde_config,
                         write_stats=False,
                         initial_benchmark=False,
                         do_nothing=config.do_nothing),
            observation_shaper=sample_selected_shaper)

        def env_creator(kvargs):
            env = make_env(**kvargs)
            return MddeMultiAgentEnv(env=env,
                                     observation_shaper=sample_selected_shaper)

        register_env("mdde", env_creator)

        # generate policies based on the created environment instance
        def gen_policy(i):
            return (None, env_instance.observation_space_dict[i],
                    env_instance.action_space_dict[i], {
                        "agent_id": i,
                        "obs_space_dict":
                        env_instance.observation_space_dict[i],
                        "act_space_dict": env_instance.action_space_dict[i],
                    })

        policies = {
            "policy_%d" % i: gen_policy(i)
            for i in env_instance.action_space_dict.keys()
        }
        policy_ids = list(policies.keys())

        def policy_mapping_fn(agent_id):
            return policy_ids[agent_id]

        exp_name = "DQN_MDDE_DEBUG"
        exp_config = {
            # === Log ===
            "log_level": "ERROR",

            # === Environment ===
            "env_config": {
                "host": self.mdde_registry_host,
                "port": self.mdde_registry_port,
                "reg_config": config_file_full_path,
                "env_config": mdde_config,
                "write_stats": True,
                "do_nothing": config.do_nothing
            },
            "num_envs_per_worker": 1,
            "horizon": config.ep_len,

            # === Policy Config ===
            # --- Model ---
            "n_step": 1,
            #"gamma": config.gamma,

            # --- Replay buffer ---
            "buffer_size": config.buffer_size,

            # --- Optimization ---
            "lr": config.lr,
            "learning_starts": config.learning_starts,
            "train_batch_size": self.TRAIN_BATCH_SIZE,
            "batch_mode": "truncate_episodes",

            # --- Parallelism ---
            "num_workers": 0,
            "num_gpus": 0,
            "num_gpus_per_worker": 0,

            # === Multi-agent setting ===
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": ray.tune.function(policy_mapping_fn)
            },
        }

        if config.debug:  # Run DQN within the same process (useful for debugging)
            dqn_trainer = DQNTrainer(env="mdde", config=exp_config)
            for step in range(0, config.num_episodes * config.ep_len):
                dqn_trainer.train()
        else:
            trainer = DQNTrainer
            run_experiments(
                {
                    exp_name: {
                        "run": trainer,
                        "env": "mdde",
                        "stop": {
                            "episodes_total": config.num_episodes,
                        },
                        "checkpoint_freq": 0,
                        "local_dir": result_dir_path_ray,
                        "restore": False,
                        "config": exp_config
                    },
                },
                verbose=0,
                reuse_actors=False
            )  # reuse_actors=True - messes up the results
Example No. 27
            "gamma": 0.95,
            "n_step": 3,
            "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
        print("-- DQN --")
        result_dqn = dqn_trainer.train()
        print(pretty_print(result_dqn))

        # improve the PPO policy
        print("-- PPO --")
        result_ppo = ppo_trainer.train()
        print(pretty_print(result_ppo))

        # Test passed gracefully.
        if args.as_test and \
                result_dqn["episode_reward_mean"] > args.stop_reward and \
                result_ppo["episode_reward_mean"] > args.stop_reward:
            print("test passed (both agents above requested reward)")
            quit(0)

        # swap weights to synchronize
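        # The fragment stops at the synchronization comment above. In RLlib's
        # two-trainer examples the step that usually follows copies weights across
        # trainers; a hedged sketch, assuming both trainers define policies named
        # "dqn_policy" and "ppo_policy" as the printed metrics suggest:
        ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
        dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))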
Example No. 28
def dqn_train(config, reporter):
    # Modify default optimizer to return the batch after each step
    config["optimizer_class"] = "CustomSyncReplayOptimizer"
    setattr(optimizers, "CustomSyncReplayOptimizer", CustomSyncReplayOptimizer)

    # Instantiate a trainer
    cfg = {
        # "n_step"                    : 3,
        # "buffer_size"               : 100000,
        # "sample_batch_size"         : 32,
        # "train_batch_size"          : 128,
        # "learning_starts"           : 5000,
        # "target_network_update_freq": 5000,
        "timesteps_per_iteration": 1000,
        # "num_workers"               : cpu_count(),
        # "per_worker_exploration"    : True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s"           : 1,
    }
    trainer = DQNTrainer(config={**config, **cfg}, env="House")

    # Modify training loop to receive batches from the optimizer
    # and return custom info in the training result dict
    def _custom_train(self):
        start_timestep = self.global_timestep

        # Update worker explorations
        exp_vals = [self.exploration0.value(self.global_timestep)]
        self.local_evaluator.foreach_trainable_policy(
            lambda p, _: p.set_epsilon(exp_vals[0]))
        for i, e in enumerate(self.remote_evaluators):
            exp_val = self.explorations[i].value(self.global_timestep)
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_epsilon(exp_val))
            exp_vals.append(exp_val)

        # Do optimization steps
        start = time.time()
        extra_metrics = defaultdict(lambda: defaultdict(list))
        metrics = ['comfort_penalty', 'cost']
        metrics += [f'{r}_temperature' for r in self.local_evaluator.env.rooms]
        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]
               ) or time.time() - start < self.config["min_iter_time_s"]:
            info_dict = self.optimizer.step()
            info_dict = info_dict.policy_batches['default_policy'].data
            for metric in metrics:
                for episode_id, info in zip(info_dict['eps_id'],
                                            info_dict['infos']):
                    extra_metrics[metric][str(episode_id)].append(info[metric])

            self.update_target_if_needed()

        if self.config["per_worker_exploration"]:
            # Only collect metrics from the third of workers with lowest eps
            result = self.collect_metrics(
                selected_evaluators=self.remote_evaluators[
                    -len(self.remote_evaluators) // 3:])
        else:
            result = self.collect_metrics()

        result.update(timesteps_this_iter=self.global_timestep -
                      start_timestep,
                      info=dict(
                          {
                              "min_exploration": min(exp_vals),
                              "max_exploration": max(exp_vals),
                              "num_target_updates": self.num_target_updates,
                          }, **self.optimizer.stats()))

        result['extra_metrics'] = extra_metrics

        return result

    trainer._train = partial(_custom_train, trainer)

    while True:
        result = trainer.train()  # Executes one training step
        # print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner