def dqn_trainer_wt_amtft_policies_in_ipd(
    logger_creator,
):
    train_n_replicates = 1
    debug = True
    hparams = get_hyperparameters(
        debug,
        train_n_replicates,
        filter_utilitarian=False,
        env="IteratedPrisonersDilemma",
    )

    _, _, rllib_config = get_rllib_config(
        hparams, welfare_fn=postprocessing.WELFARE_UTILITARIAN
    )

    rllib_config["env"] = IteratedPrisonersDilemma
    rllib_config["seed"] = int(time.time())

    policies = rllib_config["multiagent"]["policies"]
    for policy_id, policy_tuple in policies.items():
        policy_list = list(policy_tuple)
        policy_list[0] = amTFT.AmTFTRolloutsTorchPolicy
        policies[policy_id] = policy_list

    dqn_trainer = DQNTrainer(rllib_config, logger_creator=logger_creator)
    return dqn_trainer
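
A minimal driver for the helper above might look like the following sketch (an assumption, not part of the original file; `logger_creator=None` simply falls back to RLlib's default logger):

import ray

# Hypothetical usage sketch of the helper defined above.
ray.init(ignore_reinit_error=True)
trainer = dqn_trainer_wt_amtft_policies_in_ipd(logger_creator=None)
for _ in range(3):
    print(trainer.train()["episode_reward_mean"])
ray.shutdown()
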
def train_model(args):
    # We are using a custom model and environment, which need to be registered with Ray/RLlib.
    # The registered names can be anything.
    register_env("DuckieTown-MultiMap",
                 lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model, which are common among trainers,
    # here is a list of the default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Doing this allows us to record images from the DuckieTown Gym! Might be useful for report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs one full RLlib training iteration (not just a single episode)
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model",
                                       MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf"
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {}==".format(i))
        results = trainer.train()
        pretty_print(results)
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
def dqn_train(config, reporter):
    # Instantiate a trainer
    cfg = {
        # Max num timesteps for annealing schedules. Exploration is annealed from
        # 1.0 to exploration_fraction over this number of timesteps scaled by
        # exploration_fraction
        "schedule_max_timesteps": 1000000,
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": 1000,
        # Fraction of entire training period over which the exploration rate is
        # annealed
        "exploration_fraction": 0.1,
        # Final value of random action probability
        "exploration_final_eps": 0.02,
        "n_step": 3,
        "buffer_size": 500000,
        # "sample_batch_size"         : 32,
        # "train_batch_size"          : 128,
        # "learning_starts"           : 5000,
        # "target_network_update_freq": 5000,
        # "num_workers"               : NUM_WORKERS,
        # "per_worker_exploration"    : True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s"           : 1,
    }
    trainer = DQNTrainer(config={**config, **cfg})

    while True:
        result = trainer.train()  # Executes one training step
        print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner
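
The (config, reporter) signature above follows the legacy Tune function-trainable API; one plausible way to launch it is via tune.run (a sketch; the env name and stopping criterion below are placeholders, not taken from the original project):

import ray
from ray import tune

ray.init()
# Hypothetical launch of the reporter-style trainable defined above.
tune.run(
    dqn_train,
    config={"env": "CartPole-v0", "num_workers": 1},
    stop={"timesteps_total": 100000},
)
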
 def test_model(self) -> Tuple[List[float], list]:
     ray.init(logging_level=logging.INFO, ignore_reinit_error=True)
     agent = DQNTrainer(self.config, env=custom_env_name)
     weights = torch.load(
         self.params.model_dir / "trained_model.pt",
         map_location=lambda storage, loc: storage,
     )
     agent.set_weights({"default_policy": weights})
     rewards = []
     longest_screens = []
     for i in range(self.params.num_testing_episodes):
         screens = []
         try:
             logger.info("Iteration: {}", i)
             state = self.env.reset()
             done = False
             cumulative_reward = 0
             while not done:
                 action = agent.compute_action(state)
                 state, reward, done, _ = self.env.step(action)
                 screen = self.env.render(mode="rgb_array")
                 screens.append(screen)
                 cumulative_reward += reward
                 time.sleep(0.01)
             logger.info("Iteration: {}, Reward: {}", i, cumulative_reward)
             rewards.append(cumulative_reward)
         except KeyboardInterrupt:
             logger.info("Testing was interrupted")
             break
         if len(screens) > len(longest_screens):
             longest_screens = screens
     self.env.close()
     ray.shutdown()
     return rewards, longest_screens
    def test_reproducing_trajectory(self):
        class PickLargest(gym.Env):
            def __init__(self):
                self.observation_space = gym.spaces.Box(low=float("-inf"),
                                                        high=float("inf"),
                                                        shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        for fw in framework_iterator(frameworks=("tf", "torch")):
            trajs = list()
            for trial in range(3):
                ray.init()
                register_env("PickLargest", env_creator)
                config = {
                    "seed": 666 if trial in [0, 1] else 999,
                    "min_time_s_per_reporting": 0,
                    "timesteps_per_iteration": 100,
                    "framework": fw,
                }
                agent = DQNTrainer(config=config, env="PickLargest")

                trajectory = list()
                for _ in range(8):
                    r = agent.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)

                ray.shutdown()

            # trial0 and trial1 use same seed and thus
            # expect identical trajectories.
            all_same = True
            for v0, v1 in zip(trajs[0], trajs[1]):
                if v0 != v1:
                    all_same = False
            self.assertTrue(all_same)

            # trial1 and trial2 use different seeds and thus
            # most rewards tend to be different.
            diff_cnt = 0
            for v1, v2 in zip(trajs[1], trajs[2]):
                if v1 != v2:
                    diff_cnt += 1
            self.assertTrue(diff_cnt > 8)
Example #7
 def testTrainCartpoleOffPolicy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     dqn = DQNTrainer(env="test3", config={"exploration_fraction": 0.001})
     for i in range(100):
         result = dqn.train()
         print("Iteration {}, reward {}, timesteps {}".format(
             i, result["episode_reward_mean"], result["timesteps_total"]))
         if result["episode_reward_mean"] >= 100:
             return
     raise Exception("failed to improve reward")
Example #8
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    trainer = DQNTrainer(config, "tron_single_player")
    #trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./dqn_checkpoint_3800/checkpoint-3800")

    return trainer  #.get_policy("trainer")
Example #9
 def testEvaluationOption(self):
     ray.init()
     agent = DQNTrainer(env="CartPole-v0",
                        config={"evaluation_interval": 2})
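     # With evaluation_interval=2, evaluation runs only on every other train()
     # call; in between, the previous "evaluation" dict is carried over, which
     # is what the alternating equality checks below verify.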
     r0 = agent.train()
     r1 = agent.train()
     r2 = agent.train()
     r3 = agent.train()
     r4 = agent.train()
     self.assertTrue("evaluation" in r0)
     self.assertTrue("episode_reward_mean" in r0["evaluation"])
     self.assertEqual(r0["evaluation"], r1["evaluation"])
     self.assertNotEqual(r1["evaluation"], r2["evaluation"])
     self.assertEqual(r2["evaluation"], r3["evaluation"])
     self.assertNotEqual(r3["evaluation"], r4["evaluation"])
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DQNTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
Example #11
    def train(config, checkpoint_dir=None):
        trainer = DQNTrainer(config=config, env='BomberMan-v0')
        # trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
        iter = 0

        # def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs':
        spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask':
        spaces.Box(low=0, high=1, shape=(5, ), dtype=np.int32)
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
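                # 'policy_graphs' and the function() wrapper belong to the
                # legacy RLlib multi-agent API; newer versions use 'policies'
                # and a plain 'policy_mapping_fn' callable instead.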
                'policy_graphs': {
                    'default_policy_graph': (
                        DQNPolicyGraph,
                        obs_space,
                        act_space,
                        {},
                    ),
                },
                'policy_mapping_fn':
                function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
Example #13
 def test_policy_save_restore(self):
     config = DEFAULT_CONFIG.copy()
     for _ in framework_iterator(config):
         trainer = DQNTrainer(config=config, env="CartPole-v0")
         policy = trainer.get_policy()
         state1 = policy.get_state()
         trainer.train()
         state2 = policy.get_state()
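         # check(..., false=True) asserts that the two values differ, i.e. the
         # train() call above advanced the exploration and global timesteps.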
         check(state1["_exploration_state"]["last_timestep"],
               state2["_exploration_state"]["last_timestep"],
               false=True)
         check(state1["global_timestep"],
               state2["global_timestep"],
               false=True)
         # Reset policy to its original state and compare.
         policy.set_state(state1)
         state3 = policy.get_state()
         # Make sure everything is the same.
         check(state1, state3)
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework and config/model, which are common among trainers,
    # here is a list of the default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs one full RLlib training iteration (not just a single episode)
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))

    # Save the checkpoint before shutting Ray down, otherwise the trainer may
    # no longer be able to reach its workers.
    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    ray.shutdown()
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
 def test_train_cartpole_off_policy(self):
     register_env(
         "test3", lambda _: PartOffPolicyServing(gym.make("CartPole-v0"),
                                                 off_pol_frac=0.2))
     config = {
         "num_workers": 0,
         "exploration_config": {
             "epsilon_timesteps": 100
         },
     }
     for _ in framework_iterator(config, frameworks=("tf", "torch")):
         dqn = DQNTrainer(env="test3", config=config)
         reached = False
         for i in range(50):
             result = dqn.train()
             print("Iteration {}, reward {}, timesteps {}".format(
                 i, result["episode_reward_mean"],
                 result["timesteps_total"]))
             if result["episode_reward_mean"] >= 80:
                 reached = True
                 break
         if not reached:
             raise Exception("failed to improve reward")
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json', help='config file')
    parser.add_argument('--algo', type=str, default='DQN', choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true", help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch', type=int, default=100, help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100, help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128, help='training batch size')
    parser.add_argument('--state_time_span', type=int, default=5, help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30, help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)
    
    trainer = DQNTrainer(
        env=CityflowGymEnv,
        config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if (i+1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example #18
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["ppo_policy"],
            },
            "explore": False,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
            "framework": "torch" if args.torch else "tf",
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.95,
            "n_step": 3,
            "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
Example #19
        # Use a single worker process to run the server.
        "num_workers":
        0,
        # Disable OPE, since the rollouts are coming from online clients.
        "input_evaluation": [],
    }

    if args.run == "DQN":
        # Example of using DQN (supports off-policy actions).
        trainer = DQNTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "exploration_config": {
                                         "type": "EpsilonGreedy",
                                         "initial_epsilon": 1.0,
                                         "final_epsilon": 0.02,
                                         "epsilon_timesteps": 1000,
                                     },
                                     "learning_starts": 100,
                                     "timesteps_per_iteration": 200,
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
Example #20
def train(config, reporter):
    trainer = DQNTrainer(config=config, env=Coach)
    for _ in range(11):
        print(_)
        trainer.train()
Example #21
            break
    return score


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_gpus=1)
    env_config = {"board_shape": [8, 8], "length": 3}
    config = {
        "env": SnakeEnv,
        "env_config": env_config,
        "num_gpus": 1,
        "lr": 1e-4,
        "hiddens": [32, 64, 512]
    }
    agent = DQNTrainer(config=config)
    snake_env = SnakeEnv(config=env_config)
    if args.test:
        assert args.restore is not None
        agent.restore(args.restore)
        while True:
            score = simulate_one_game(render=True)
            print("Score: {}".format(score))
    else:
        if args.restore is not None:
            agent.restore(args.restore)
            i = agent.iteration
        else:
            i = 0
        while True:
            train_one_step()
Example #22
_____________________________________________________________________
value_out (Dense)          (None, 1)     257      fc_value_2[0][0]
=====================================================================
Total params: 134,915
Trainable params: 134,915
Non-trainable params: 0
_____________________________________________________________________
"""
# __query_action_dist_end__

# __get_q_values_dqn_start__
# Get a reference to the model through the policy
import numpy as np
from ray.rllib.agents.dqn import DQNTrainer

trainer = DQNTrainer(env="CartPole-v0", config={"framework": "tf2"})
model = trainer.get_policy().model
# <ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQModel ...>

# List of all model variables
model.variables()

# Run a forward pass to get base model output. Note that complex observations
# must be preprocessed. An example of preprocessing is examples/saving_experiences.py
model_out = model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=832, shape=(1, 256), dtype=float32, numpy=...)

# Access the base Keras models (all default models have a base)
model.base_model.summary()
"""
Model: "model"
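
The docs excerpt above is cut off at the base-model summary; a plausible continuation (an assumption based on the distributional Q-model API of the same RLlib generation) queries the DQN-specific heads from the base model output:

# Assumed continuation, not present in the excerpt above:
# access the DQN-specific Q-value and state-value heads.
print(model.get_q_value_distributions(model_out))
model.q_value_head.summary()
print(model.get_state_value(model_out))
model.state_value_head.summary()
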
Example #23
            # as well to the DQN agent
            "observation_filter": "MeanStdFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "model": {
                "vf_share_layers": True,
            },
            "gamma": 0.95,
            "n_step": 3,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": args.framework,
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
Example #24
def main():

    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    ### dw ###
    #parser.add_argument("--num-agents", type=int, default=6)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)

    num_agents = len(config_env["intersection_id"])
    '''
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    '''

    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    #config_agent = agent_config(config_env)

    # # build cityflow environment
    '''
    trainer = DQNTrainer(
        env=CityFlowEnvRay,
        config=config_agent)
    '''

    policies = {
        #"dqn_policy":(None, obs_space, act_space, config_env)
        #"policy_{}".format(i): (None, obs_space, act_space, config_env)
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)

    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
                                                       SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
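        # Presumably the loop also persists the checkpoint path so that the
        # restore branch above can find it on the next run (a sketch, not part
        # of the excerpt):
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)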
Example #26
    def run_dqn(self, config):
        # RAY tmp
        temp_dir_full_path_obj = Path(self.ray_temp_dir).resolve()
        temp_dir_full_path_obj.mkdir(parents=True, exist_ok=True)
        temp_dir_full_path = str(temp_dir_full_path_obj)
        # Result paths
        result_dir_path_root = Path(self.run_result_dir).resolve()
        # Separate MDDE output and Ray output
        result_dir_path_ray_obj = result_dir_path_root.joinpath("ray")
        result_dir_path_ray_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_ray = str(result_dir_path_ray_obj)
        result_dir_path_mdde_obj = result_dir_path_root.joinpath("mdde")
        result_dir_path_mdde_obj.mkdir(parents=True, exist_ok=True)
        result_dir_path_mdde = str(result_dir_path_mdde_obj)
        # Config
        config_file_full_path = str(Path(self.mdde_registry_config).resolve())
        # MDDE tmp
        temp_env_dir = self.env_temp_dir

        os.makedirs(os.path.abspath(temp_env_dir), exist_ok=True)

        ray.init(
            num_gpus=0,
            num_cpus=4,
            #temp_dir=temp_dir_full_path
        )

        mdde_config = ConfigEnvironment(tmp_dir=temp_env_dir,
                                        result_dir=result_dir_path_mdde)

        def make_env(host: str,
                     port: int,
                     reg_config: str,
                     env_config: ConfigEnvironment,
                     write_stats: bool,
                     initial_benchmark: bool = False,
                     do_nothing: bool = True) -> Environment:
            """
            Configure MDDE environment to run default.
            :param host: MDDE registry host or IP.
            :param port: MDDE registry control port.
            :param reg_config: Path to MDDE registry config.
            :param env_config: Environment configuration object.
            :param write_stats: True to write additional analytics info.
            :param initial_benchmark: Execute benchmark immediately upon execution.
            :param do_nothing: Enable or disable the agents' "do_nothing" action.
            :return: MDDE Environment.
            """

            # Ray is peculiar in the way it handles environments; passing a pre-configured environment might cause
            # unexpected behavior. Customize the code of this extension if more complex environments are needed.

            # Create Registry client
            tcp_client = RegistryClientTCP(host, port)
            read_client: PRegistryReadClient = tcp_client
            write_client: PRegistryWriteClient = tcp_client
            ctrl_client: PRegistryControlClient = tcp_client

            # Registry configuration
            config_container = ConfigRegistry()
            config_container.read(reg_config)

            # Create agents
            agents = list()
            idx = 0
            for node in config_container.get_nodes():
                agents.append(
                    SingleNodeDefaultAgent(agent_name=node.id,
                                           agent_id=idx,
                                           data_node_id=node.id,
                                           write_stats=write_stats,
                                           allow_do_nothing=do_nothing))
                idx += 1

            # Create scenario
            scenario = DefaultScenario(
                num_fragments=20,
                num_steps_before_bench=config.bench_psteps,
                agents=agents,
                benchmark_clients=config.bench_clients,
                write_stats=write_stats)  # Number of YCSB threads

            # Create environment
            environment = Environment(config=env_config,
                                      scenario=scenario,
                                      registry_ctrl=ctrl_client,
                                      registry_write=write_client,
                                      registry_read=read_client,
                                      write_stats=write_stats)
            # Re-generate data
            environment.initialize_registry(with_benchmark=initial_benchmark)

            return environment

        def obs_shaper_2d_box(obs):
            """Reshapes the observation into a form suitable for a 2D box space. Example 1.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # a_1: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # a_2: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
            # Hint: 2D array where rows are agents, and attributes in columns are as shown above.
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]),
                               order='F')

        def obs_shaper_flat_box(obs):
            """Reshapes the observation into a form suitable for a flat (1D) box space. Example 2.
            Note: Guaranteed to work only with the Default agent - Default scenario combination."""
            # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
            # [0-4(a_1: allocation) 5-9(a_1: popularity) 10-14(a_1: ownership binary flag)
            #  15-19(a_2: allocation) 20-24(a_2: popularity) 25-29(a_2: ownership binary flag)]
            return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F') \
                .reshape((obs.shape[0] * obs.shape[1] * obs.shape[2]), order='C')

        sample_selected_shaper = obs_shaper_flat_box
        """Observation shaper selected. Set None if you want to use the default one in the wrapper."""

        # Create and initialize the environment before passing it to Ray.
        # This makes it impossible to run multiple instances of the environment, but that is intentional: the
        # environment represents a distributed infrastructure of services, so it can't be easily created and
        # destroyed like a simple local game-like environment.
        env_instance = MddeMultiAgentEnv(
            env=make_env(host=self.mdde_registry_host,
                         port=self.mdde_registry_port,
                         reg_config=config_file_full_path,
                         env_config=mdde_config,
                         write_stats=False,
                         initial_benchmark=False,
                         do_nothing=config.do_nothing),
            observation_shaper=sample_selected_shaper)

        def env_creator(kvargs):
            env = make_env(**kvargs)
            return MddeMultiAgentEnv(env=env,
                                     observation_shaper=sample_selected_shaper)

        register_env("mdde", env_creator)

        # generate policies based on the created environment instance
        def gen_policy(i):
            return (None, env_instance.observation_space_dict[i],
                    env_instance.action_space_dict[i], {
                        "agent_id": i,
                        "obs_space_dict":
                        env_instance.observation_space_dict[i],
                        "act_space_dict": env_instance.action_space_dict[i],
                    })

        policies = {
            "policy_%d" % i: gen_policy(i)
            for i in env_instance.action_space_dict.keys()
        }
        policy_ids = list(policies.keys())

        def policy_mapping_fn(agent_id):
            return policy_ids[agent_id]

        exp_name = "DQN_MDDE_DEBUG"
        exp_config = {
            # === Log ===
            "log_level": "ERROR",

            # === Environment ===
            "env_config": {
                "host": self.mdde_registry_host,
                "port": self.mdde_registry_port,
                "reg_config": config_file_full_path,
                "env_config": mdde_config,
                "write_stats": True,
                "do_nothing": config.do_nothing
            },
            "num_envs_per_worker": 1,
            "horizon": config.ep_len,

            # === Policy Config ===
            # --- Model ---
            "n_step": 1,
            #"gamma": config.gamma,

            # --- Replay buffer ---
            "buffer_size": config.buffer_size,

            # --- Optimization ---
            "lr": config.lr,
            "learning_starts": config.learning_starts,
            "train_batch_size": self.TRAIN_BATCH_SIZE,
            "batch_mode": "truncate_episodes",

            # --- Parallelism ---
            "num_workers": 0,
            "num_gpus": 0,
            "num_gpus_per_worker": 0,

            # === Multi-agent setting ===
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": ray.tune.function(policy_mapping_fn)
            },
        }

        if config.debug:  # Run DQN within the same process (useful for debugging)
            dqn_trainer = DQNTrainer(env="mdde", config=exp_config)
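            # Note: each train() call runs a full RLlib training iteration
            # (many environment steps), not a single step.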
            for step in range(0, config.num_episodes * config.ep_len):
                dqn_trainer.train()
        else:
            trainer = DQNTrainer
            run_experiments(
                {
                    exp_name: {
                        "run": trainer,
                        "env": "mdde",
                        "stop": {
                            "episodes_total": config.num_episodes,
                        },
                        "checkpoint_freq": 0,
                        "local_dir": result_dir_path_ray,
                        "restore": False,
                        "config": exp_config
                    },
                },
                verbose=0,
                reuse_actors=False
            )  # reuse_actors=True - messes up the results
Example #27
        # Use the connector server to generate experiences.
        "input": (
            lambda ioctx: PolicyServerInput(ioctx, SERVER_ADDRESS, SERVER_PORT)
        ),
        # Use a single worker process to run the server.
        "num_workers": 0,
        # Disable OPE, since the rollouts are coming from online clients.
        "input_evaluation": [],
    }

    if args.run == "DQN":
        # Example of using DQN (supports off-policy actions).
        trainer = DQNTrainer(
            env=env,
            config=dict(
                connector_config, **{
                    "learning_starts": 100,
                    "timesteps_per_iteration": 200,
                    "framework": args.framework,
                }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(
            env=env,
            config=dict(
                connector_config, **{
                    "rollout_fragment_length": 1000,
                    "train_batch_size": 4000,
                    "framework": args.framework,
                }))
    else:
        raise ValueError("--run must be DQN or PPO")
Example #28
def run_saved(args):
    if args.OSM[0] == 1 and args.OSM[1] == 0:
        setting = "RLvsOSM"
    elif args.OSM[0] == 1 and args.OSM[1] == 1:
        setting = "OSMvsOSM"
    else:
        setting = "RL{0}".format(len(args.alphas) - sum(args.honest))
    if args.save_path == 'none':
        checkpointnum = 0
    else:
        checkpointnum = args.save_path.split('-')[-1]
    env_name = "{setting}_{spirit}_{blocks}_{alpha:04d}_{spy}_{checkpointnum}".format(
        spirit=int(args.team_spirit * 100),
        blocks=int(args.blocks),
        alpha=int(args.alphas[0] * 10000),
        spy=args.spy[1],
        setting=setting,
        checkpointnum=checkpointnum)
    ray.init(local_mode=True,
             memory=700 * 1024 * 1024,
             object_store_memory=100 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 1024)
    print("Testing {0}".format(setting), env_name)

    def select_policy(agent_id):
        return agent_id

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env(env_name, lambda config: ParametricBitcoin(config))

    if args.extended:
        action_n = 6
    else:
        action_n = 4
    # define the state space, one for parties that have access to spy info and one without
    spy_state_space = constants.make_spy_space(len(args.alphas), args.blocks)
    blind_state_space = constants.make_blind_space(len(args.alphas),
                                                   args.blocks)
    policies = dict()
    osm_space = spaces.Box(
        low=np.zeros(4),
        high=np.array([args.blocks + 4, args.blocks + 4, args.blocks + 4, 3.]))
    if sum(args.OSM) > 0:
        osm = OSM_strategy(
            osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })

    blind_dim = 0
    for space in blind_state_space:
        blind_dim += get_preprocessor(space)(space).size

    spy_dim = 0
    for space in spy_state_space:
        spy_dim += get_preprocessor(space)(space).size

    spy_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(spy_dim, ))
    })
    blind_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(blind_dim, ))
    })
    preps = [None for i in range(len(args.alphas))]
    for i in range(len(args.alphas)):
        if args.spy[i] == 1:
            policies[str(i)] = (None, spy_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": True,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(spy_state_space_wrapped)(
                spy_state_space_wrapped)
        elif args.OSM[i] == 1:
            policies[str(i)] = (OSM_strategy, osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
        elif args.honest[i] == 1:
            policies[str(i)] = (Honest, osm_space, spaces.Discrete(6), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks,
                'fiftyone': args.fiftyone[i],
                'extended': args.extended
            })
        else:
            policies[str(i)] = (None, blind_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": False,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(blind_state_space_wrapped)(
                blind_state_space_wrapped)
    env_config = {
        'max_hidden_block': args.blocks,
        'alphas': args.alphas,
        'gammas': args.gammas,
        'ep_length': args.ep_length,
        'print': args.debug,
        'spy': args.spy,
        'team_spirit': args.team_spirit,
        'OSM': args.OSM,
        'extended': args.extended,
        'honest': args.honest,
    }
    policies_to_train = [
        str(i) for i in range(len(args.alphas))
        if args.OSM[i] != 1 and args.honest[i] != 1
    ]
    env = ParametricBitcoin(env_config=env_config)
    if len(policies_to_train) != 0:
        if args.trainer == 'PPO':
            trainer = PPOTrainer(env=BitcoinEnv,
                                 config={
                                     "num_workers": 0,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
        else:
            trainer = DQNTrainer(env=env_name,
                                 config={
                                     "eager": True,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
            model = trainer.get_policy().model
            print(model.base_model.summary())
        print("Restoring model")
        trainer.restore(args.save_path)
    loaded_policies = dict()
    for k in range(len(args.alphas)):
        if args.OSM[k] == 1:
            loaded_policies[str(k)] = osm
        elif args.honest[k] == 1:
            honest = Honest(
                osm_space,
                spaces.Discrete(6),
                {
                    'alpha': args.alphas[0],
                    'gamma': args.gammas[0],
                    'blocks': args.blocks,
                    'fiftyone': args.fiftyone[k],
                    'extended': args.extended
                },
            )
            loaded_policies[str(k)] = honest
            preps[k] = None
        else:
            loaded_policies[str(k)] = trainer.get_policy(str(k))
    trials = 100000
    reslist = []
    for j in range(3):
        blocks = np.zeros(len(args.alphas) + 1)
        event_blocks = np.zeros(len(args.alphas) + 1)
        action_dist = {
            str(i): np.zeros(action_n)
            for i in range(len(args.alphas))
        }
        res = dict()
        for i in range(trials):
            obs = env.reset()
            isDone = False
            RNNstates = {str(i): [] for i in range(len(args.alphas))}
            while not isDone:
                action_dict = dict()
                for k in range(len(policies)):
                    prep = preps[k]
                    if not prep:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=obs[str(k)],
                                                      state=[])
                    else:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=prep.transform(
                                obs[str(k)]),
                                                      state=[])
                    action_dist[str(k)][action_dict[str(k)]] += 1
                obs, _, done, _ = env.step(action_dict)
                isDone = done['__all__']
            if i == 0 and j == 0:
                with open(
                        os.path.join('/afs/ece/usr/charlieh/eval_results',
                                     env_name + '_trace.txt'), 'w+') as f:
                    f.write(env.wrapped._debug_string)
            blocks += env.wrapped._accepted_blocks
            event_blocks += env.wrapped._total_blocks
            total_event_blocks = np.sum(event_blocks)
            if i % 100 == 0:
                print("Relative rewards", blocks / np.sum(blocks))
                print("Relative received", event_blocks / total_event_blocks)
                # Use a separate loop variable so the outer trial counter `i`
                # is not clobbered.
                for a in range(len(args.alphas)):
                    print("Action dist", str(a),
                          action_dist[str(a)] / np.sum(action_dist[str(a)]))
        res['blocks'] = blocks
        res['action dist'] = action_dist
        res['blocks norm'] = blocks / np.sum(blocks)
        res['actions norm'] = {
            str(i): action_dist[str(i)] / np.sum(action_dist[str(i)])
            for i in range(len(args.alphas))
        }
        reslist.append(res)
    np.save(os.path.join('/afs/ece/usr/charlieh/eval_results', env_name),
            reslist,
            allow_pickle=True)
            "explore": False,
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch else "tf",
        })

    dqn_trainer = DQNTrainer(
        env="multi_agent_cartpole",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.95,
            "n_step": 3,
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "framework": "torch" if args.torch or args.mixed_torch_tf else "tf"
        })

    # You should see both the printed X and Y approach 200 as this trains:
    # info:
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
    for i in range(args.stop_iters):
        print("== Iteration", i, "==")
 # env_config = {'map': MAP}
 # env = VectorEnv.wrap(existing_envs=[warehouse_env_creator(env_config) for _ in range(NUM_ENVS)],
 #                      num_envs=NUM_ENVS)
 # config = {"env": "warehouse_env",
 #           "framework": "torch",
 #           "num_gpus": 0.1,
 #           "num_gpus_per_worker": 0.1,
 #           'num_envs_per_worker': 6,
 #           "evaluation_interval": 5, }
 with open(params_path, "rb") as f:
     config = cloudpickle.load(f)
 config["explore"] = False
 config['num_envs_per_worker'] = 1
 print("Trained on map: \n", config["env_config"]["maps"])
 config["env_config"]["maps"] = MAP_WITH_EXCEPTION
 trainer = DQNTrainer(config=config)
 trainer.restore(path.format(checkpoint, checkpoint))
 policy = trainer.get_policy()
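 # _evaluate() is an internal Trainer method in this RLlib version; it appears
 # to run one evaluation round so that evaluation_workers below have samples.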
 trainer._evaluate()
 samples = (trainer.evaluation_workers.local_worker().sample()
            for _ in range(NUM_EPISODES))
 rows = map(lambda x: np.concatenate([
     x["unroll_id"][:, None],
     np.arange(0, x.count)[:,None],
     x["obs"],
     x["actions"][:, None],
     x["q_values"],
     x["rewards"][:, None],
     x["dones"][:, None],
     x["new_obs"],
     process_info(x["infos"])],