def train_model(args):
    # We are using a custom model and environment, both of which need to be registered with Ray/RLlib.
    # The registered names can be anything.
    register_env("DuckieTown-MultiMap",
                 lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. env, config/framework, and config/model are common to all
    # trainers; the rest of the config below mixes common and DQN-specific keys.
    # The default (common) config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # The additional DQN-specific keys are listed here:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Doing this allows us to record images from the DuckieTown Gym! Might be useful for the report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of training iterations (treated as epochs here)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs a single training iteration
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
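The "image-dqn" custom model referenced in the config has to be registered with RLlib's ModelCatalog before the trainer is built; that registration is not shown in this excerpt. A minimal sketch, with a hypothetical module path and class name:

from ray.rllib.models import ModelCatalog
# Hypothetical import: the project's actual TorchModelV2 subclass lives elsewhere.
from image_dqn_model import ImageDQNModel

# Must run before DQNTrainer(...) so that "custom_model": "image-dqn" resolves.
ModelCatalog.register_custom_model("image-dqn", ImageDQNModel)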
Example 2
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolutional network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    trainer = DQNTrainer(config, "tron_single_player")
    #trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./dqn_checkpoint_3800/checkpoint-3800")

    return trainer  #.get_policy("trainer")
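A short rollout sketch with the trainer returned by load_agent, assuming the Tron environment follows the standard Gym reset/step API (illustrative only, not part of the original example):

trainer = load_agent()
env = TronRaySinglePlayerEnvironment(board_size=13, num_players=4,
                                     agent=SimpleAvoidAgent(noise=0.05))

obs = env.reset()
done = False
total_reward = 0
while not done:
    # compute_action runs the observation through the trainer's configured
    # preprocessing ("tron_prep") before querying the policy.
    action = trainer.compute_action(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print("Episode reward:", total_reward)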
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # The Simulator env uses a single map, so it's better suited for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high-level discrete actions.
        return DiscreteWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an existing env object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DQNTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
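The functions above expect an argparse namespace with epochs, model_path, and map attributes. A minimal, hypothetical parser sketch (flag defaults are guesses):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=10,
                    help='Number of training iterations to run.')
parser.add_argument('--model_path', default='',
                    help='Checkpoint to restore from (empty = start from scratch).')
parser.add_argument('--map', default='loop_empty',
                    help='DuckieTown map used by the evaluation simulator.')
args = parser.parse_args()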
def train_model(args, config):
    # Define the trainer. env, config/framework, and config/model are common to all
    # trainers; the rest of the passed-in config mixes common and DQN-specific keys.
    # The default (common) config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # The additional DQN-specific keys are listed here:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of training iterations (treated as epochs here)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
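A sketch of how the returned values might be consumed, restoring the best-performing checkpoint with the same config for a final evaluation pass (not part of the original example):

best_reward, best_epoch, best_path = train_model(args, config)
print(f'Best mean reward {best_reward:.2f} at epoch {best_epoch}')

# Rebuild a trainer with the same config and load the best checkpoint.
eval_trainer = DQNTrainer(env="DuckieTown-MultiMap", config=config)
eval_trainer.restore(best_path)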
Example 5
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
    else:
        raise ValueError("--run must be DQN or PPO")

    checkpoint_path = CHECKPOINT_FILE.format(args.run)

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_path):
        # Read the stored path into a separate variable so checkpoint_path keeps
        # pointing at the path file that the loop below writes to.
        restore_path = open(checkpoint_path).read()
        print("Restoring from checkpoint path", restore_path)
        trainer.restore(restore_path)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint = trainer.save()
        print("Last checkpoint", checkpoint)
        with open(checkpoint_path, "w") as f:
            f.write(checkpoint)
if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
Example 7
 # env = VectorEnv.wrap(existing_envs=[warehouse_env_creator(env_config) for _ in range(NUM_ENVS)],
 #                      num_envs=NUM_ENVS)
 # config = {"env": "warehouse_env",
 #           "framework": "torch",
 #           "num_gpus": 0.1,
 #           "num_gpus_per_worker": 0.1,
 #           'num_envs_per_worker': 6,
 #           "evaluation_interval": 5, }
 with open(params_path, "rb") as f:
     config = cloudpickle.load(f)
 config["explore"] = False
 config['num_envs_per_worker'] = 1
 print("Trained on map: \n", config["env_config"]["maps"])
 config["env_config"]["maps"] = MAP_WITH_EXCEPTION
 trainer = DQNTrainer(config=config)
 trainer.restore(path.format(checkpoint, checkpoint))
 policy = trainer.get_policy()
 trainer._evaluate()
 samples = (trainer.evaluation_workers.local_worker().sample()
            for _ in range(NUM_EPISODES))
 rows = map(lambda x: np.concatenate([
     x["unroll_id"][:, None],
     np.arange(0, x.count)[:, None],
     x["obs"],
     x["actions"][:, None],
     x["q_values"],
     x["rewards"][:, None],
     x["dones"][:, None],
     x["new_obs"],
     process_info(x["infos"])],
     -1),
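 # The map(...) expression above is truncated in this excerpt. Assuming it completes
 # into an iterable of per-episode 2-D arrays, the rows could be stacked and saved
 # roughly like this (illustrative only; the file name is an assumption):
 table = np.concatenate(list(rows), axis=0)
 np.savetxt("evaluation_rollouts.csv", table, delimiter=",")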
Example 8
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_gpus=1)
    env_config = {"board_shape": [8, 8], "length": 3}
    config = {
        "env": SnakeEnv,
        "env_config": env_config,
        "num_gpus": 1,
        "lr": 1e-4,
        "hiddens": [32, 64, 512]
    }
    agent = DQNTrainer(config=config)
    snake_env = SnakeEnv(config=env_config)
    if args.test:
        assert args.restore is not None
        agent.restore(args.restore)
        while True:
            score = simulate_one_game(render=True)
            print("Score: {}".format(score))
    else:
        if args.restore is not None:
            agent.restore(args.restore)
            i = agent.iteration
        else:
            i = 0
        while True:
            train_one_step()
            if i % 10 == 0:
                save_ckpt()
                # avg_score = 0
                # for _ in range(100):
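train_one_step, save_ckpt, and simulate_one_game are helpers defined elsewhere in the original script. Hypothetical sketches of the first two, based only on the standard Trainer API already used above (the real implementations may differ):

def train_one_step():
    # One RLlib training iteration; report the running mean episode reward.
    result = agent.train()
    print("iter {}: episode_reward_mean = {}".format(
        agent.iteration, result["episode_reward_mean"]))

def save_ckpt():
    # Write a checkpoint and report where it was saved.
    checkpoint_path = agent.save()
    print("Saved checkpoint to", checkpoint_path)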
Example 9
register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator()))

env = (env_creator())
# obs_space = env.observation_space
# print(obs_space)
# act_space = test_env.action_space

with open(params_path, "rb") as f:
    config = pickle.load(f)
    # num_workers not needed since we are not training
    del config['num_workers']
    del config['num_gpus']

ray.init(num_cpus=8, num_gpus=0)
DQNAgent = DQNTrainer(env="leduc_holdem", config=config)
DQNAgent.restore(checkpoint_path)

reward_sums = {a: 0 for a in env.possible_agents}
i = 0
env.reset()

for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    obs = observation['observation']
    reward_sums[agent] += reward
    if done:
        action = None
    else:
        print(DQNAgent.get_policy(agent))
        policy = DQNAgent.get_policy(agent)
        batch_obs = {
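The example is cut off while building batch_obs. A self-contained, hedged alternative version of the loop uses Trainer.compute_action with the per-agent policy id instead of calling the policy directly (it assumes the policies accept the raw dict observation returned by env.last()):

for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    reward_sums[agent] += reward
    if done:
        action = None
    else:
        # Assumption: policy ids match the PettingZoo agent names, as implied by
        # DQNAgent.get_policy(agent) above.
        action = DQNAgent.compute_action(observation, policy_id=agent)
    env.step(action)

print("Total rewards per agent:", reward_sums)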
Example 10
    config['framework'] = "torch"
    config['gamma'] = args.gamma

    # NN vision
    config['model']['dim'] = 21
    config['model']['conv_filters'] = [[8, [3, 3], 2], [16, [2, 2], 2],
                                       [512, [6, 6], 1]]
    #  DQN config
    config['v_min'] = -400
    config['v_max'] = 400
    config['noisy'] = False
    trainner = DQNTrainer(config=config, env="mars_explorer:explorer-v01")

    if PATH != "":
        print(f"Loading model {PATH}")
        trainner.restore(PATH)
    else:
        print(f"Starting without any a priori knowledge")
    N_start = 0
    N_finish = args.steps
    results = []
    episode_data = []
    episode_json = []

    writer = SummaryWriter(comment="SAC-GEP")

    for batch in range(N_start, N_finish):

        initial_time = time.time()

        result = trainner.train()
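        # The excerpt is truncated here. A hedged sketch of logging each result to
        # the SummaryWriter created above (scalar tag names are assumptions):
        writer.add_scalar("reward/episode_reward_mean",
                          result["episode_reward_mean"], batch)
        writer.add_scalar("time/seconds_per_iteration",
                          time.time() - initial_time, batch)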
Example 11
            # For eval afterward
            config_copy = config.copy()
            config_copy['explore'] = False
            trainer = DDPGTrainer(config=config_copy, env='Bertrand')

        analysis = tune.run(
            trainer_choice,
            # num_samples = 4,
            config=config,
            local_dir='./log',
            stop={'training_iteration': sessions},
            mode='max',
            metric='episode_reward_mean',
            checkpoint_at_end=True)

        trainer.restore(checkpoint_path=analysis.best_checkpoint)

        # analysis = tune.run(
        #     trainer_choice,
        #     # num_samples = 4,
        #     config = config_copy,
        #     local_dir = './log',
        #     stop = {'training_iteration': sessions},
        #     mode = 'max',
        #     metric = 'episode_reward_mean',
        #     restore = analysis.best_checkpoint,
        #     checkpoint_at_end = True
        # )
    else:
        # Dual algorithm training
Example 12
                "timesteps_per_iteration": 200,
                "env_config": {
                    "observation_size": args.observation_size,
                    "action_size": args.action_size,
                },
            })
    elif args.run == "PG":
        trainer = PGTrainer(
            env="srv",
            config={
                "num_workers": 0,
                "env_config": {
                    "observation_size": args.observation_size,
                    "action_size": args.action_size,
                },
            })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(args.checkpoint_file):
        checkpoint_file = open(args.checkpoint_file).read()
        print("Restoring from checkpoint path", checkpoint_file)
        trainer.restore(checkpoint_file)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint_file = trainer.save()
        print("Last checkpoint", checkpoint_file)
        with open(args.checkpoint_file, "w") as f:
            f.write(checkpoint_file)