Example no. 1
def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model",
                                       MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf"
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {}==".format(i))
        results = trainer.train()
        pretty_print(results)
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
Example no. 2
def train_model(args):
    # We use a custom model and environment, both of which must be
    # registered with Ray/RLlib; the names can be anything.
    register_env("DuckieTown-MultiMap",
                 lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework, and config/model
    # (which are common to all trainers), the remaining keys are algorithm-specific.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are also additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Setting this lets us record images from the DuckieTown gym;
            # might be useful for the report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of training iterations (called "epochs" here)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs a single training iteration, not a single episode.
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
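The config above refers to a custom model registered under the name "image-dqn", which this excerpt does not show. A minimal sketch of the registration step, assuming a torch model class named ImageDQNModel defined elsewhere (the class name is hypothetical):

from ray.rllib.models import ModelCatalog

# Must run before DQNTrainer is built; "image-dqn" is the name
# referenced by the "custom_model" config key above.
ModelCatalog.register_custom_model("image-dqn", ImageDQNModel)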
Example no. 3
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs': spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3),
                          dtype=np.float32),
        'action_mask': spaces.Box(low=0, high=1, shape=(5,), dtype=np.int32),
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
                'policy_graphs': {
                    'default_policy_graph': (DQNPolicyGraph, obs_space,
                                             act_space, {}),
                },
                'policy_mapping_fn': function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i} ==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
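The callbacks entry above points at an on_episode_end function defined elsewhere. Under the legacy callbacks API used here (a dict of plain functions wrapped in ray.tune's function()), the callback receives a single info dict; a minimal sketch, with the custom metric purely illustrative:

def on_episode_end(info):
    # info carries the episode (and env) objects under string keys.
    episode = info['episode']
    # Record a custom per-episode metric (the metric name is hypothetical).
    episode.custom_metrics['episode_len'] = episode.length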
Example no. 4
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework, and config/model
    # (which are common to all trainers), the remaining keys are algorithm-specific.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are also additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() trains only a single episode
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
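A sketch of how a caller might use the returned triple to reload the best-performing checkpoint for evaluation (the surrounding driver code is not part of the excerpt, so this is an assumption about intended use):

best_reward, best_epoch, best_path = train_model(args, config)
print(f'Best mean reward {best_reward} at epoch {best_epoch}')

# Rebuild a trainer with the same config and restore the best checkpoint.
eval_trainer = DQNTrainer(env="DuckieTown-MultiMap", config=config)
eval_trainer.restore(best_path)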
Example no. 5
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    for _ in range(FLAGS.iter):
        print(pretty_print(trainer.train()))

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
    # Shut down Ray only after the checkpoint and artifacts have been written.
    ray.shutdown()
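Because the trainer consumes experiences through PolicyServerInput, a separate process has to connect with a PolicyClient and drive the actual environment. A minimal sketch of that client loop, assuming the same address/port and an already-constructed gym-style env (the local names here are illustrative):

from ray.rllib.env.policy_client import PolicyClient

client = PolicyClient("http://{}:{}".format(ADDRESS, PORT),
                      inference_mode="remote")

obs = env.reset()
episode_id = client.start_episode()
done = False
while not done:
    action = client.get_action(episode_id, obs)
    obs, reward, done, info = env.step(action)
    client.log_returns(episode_id, reward)
client.end_episode(episode_id, obs)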
Example no. 6
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json', help='config file')
    parser.add_argument('--algo', type=str, default='DQN', choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true", help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore from')
    parser.add_argument('--epoch', type=int, default=100, help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100, help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128, help='training batch size')
    parser.add_argument('--state_time_span', type=int, default=5, help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30, help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)
    
    trainer = DQNTrainer(
        env=CityflowGymEnv,
        config=config_agent)
    for i in range(args.epoch):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if (i + 1) % args.save_freq == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example no. 7
                                     "log_level": "INFO",
                                     "framework": args.framework,
                                 }))
    elif args.run == "PPO":
        # Example of using PPO (does NOT support off-policy actions).
        trainer = PPOTrainer(env=env,
                             config=dict(
                                 connector_config, **{
                                     "sample_batch_size": 1000,
                                     "train_batch_size": 4000,
                                     "framework": args.framework,
                                 }))
    else:
        raise ValueError("--run must be DQN or PPO")

    # The marker file stores the path of the most recent checkpoint.
    checkpoint_file = CHECKPOINT_FILE.format(args.run)

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_file):
        checkpoint_path = open(checkpoint_file).read()
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint = trainer.save()
        print("Last checkpoint", checkpoint)
        # Write the new checkpoint path into the marker file (not into the
        # restored checkpoint path, which the original code overwrote).
        with open(checkpoint_file, "w") as f:
            f.write(checkpoint)
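CHECKPOINT_FILE is a module-level template that the excerpt does not show. A plausible definition, consistent with the .format(args.run) call above (the exact filename is an assumption):

# One marker file per algorithm, e.g. last_checkpoint_DQN.out.
CHECKPOINT_FILE = "last_checkpoint_{}.out"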
Example no. 8
def main():

    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore from')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    ### dw ###
    #parser.add_argument("--num-agents", type=int, default=6)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)

    num_agents = len(config_env["intersection_id"])
    '''
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    '''

    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    #config_agent = agent_config(config_env)

    # # build cityflow environment
    '''
    trainer = DQNTrainer(
        env=CityFlowEnvRay,
        config=config_agent)
    '''

    policies = {
        #"dqn_policy":(None, obs_space, act_space, config_env)
        #"policy_{}".format(i): (None, obs_space, act_space, config_env)
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)

    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(args.epoch):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if i % args.save_freq == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
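ECglassServing is registered as the serving environment above; to support off-policy actions it would subclass RLlib's ExternalEnv and drive episodes from its run() loop. A minimal sketch, where the spaces and the poll_observation/apply_action helpers are hypothetical stand-ins for the real device interface:

from gym import spaces
from ray.rllib.env.external_env import ExternalEnv

class ECglassServing(ExternalEnv):
    def __init__(self):
        # Hypothetical spaces; the real ones depend on the device.
        ExternalEnv.__init__(self,
                             action_space=spaces.Discrete(2),
                             observation_space=spaces.Box(0.0, 1.0, shape=(4,)))

    def run(self):
        while True:
            episode_id = self.start_episode()
            obs = poll_observation()  # hypothetical data source
            done = False
            while not done:
                action = self.get_action(episode_id, obs)
                obs, reward, done = apply_action(action)  # hypothetical
                self.log_returns(episode_id, reward)
            self.end_episode(episode_id, obs)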
Example no. 10
                connector_config, **{
                    "sample_batch_size": 1000,
                    "train_batch_size": 4000,
                    "framework": args.framework,
                }))
    else:
        raise ValueError("--run must be DQN or PPO")

    # This file contains the checkpoint file path.
    checkpoint_path_file = CHECKPOINT_FILE

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(checkpoint_path_file):
        checkpoint_path = open(checkpoint_path_file).read()
        if os.path.exists(checkpoint_path):
            print("Restoring from checkpoint path", checkpoint_path)
            trainer.restore(checkpoint_path)
        else:
            print("checkpoint file does not exist")
    else:
        print("file containing checkpoint file path does not exist")

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        # Presumably args.modDir from the parsed CLI options, not the parser itself.
        checkpoint = trainer.save(args.modDir)
        print("Last checkpoint", checkpoint)
        with open(checkpoint_path_file, "w") as f:
            f.write(checkpoint)
            
Example no. 11
                "timesteps_per_iteration": 200,
                "env_config": {
                    "observation_size": args.observation_size,
                    "action_size": args.action_size,
                },
            })
    elif args.run == "PG":
        trainer = PGTrainer(
            env="srv",
            config={
                "num_workers": 0,
                "env_config": {
                    "observation_size": args.observation_size,
                    "action_size": args.action_size,
                },
            })
    else:
        raise ValueError("--run must be DQN or PG")

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(args.checkpoint_file):
        checkpoint_file = open(args.checkpoint_file).read()
        print("Restoring from checkpoint path", checkpoint_file)
        trainer.restore(checkpoint_file)

    # Serving and training loop
    while True:
        print(pretty_print(trainer.train()))
        checkpoint_file = trainer.save()
        print("Last checkpoint", checkpoint_file)
        with open(args.checkpoint_file, "w") as f:
            f.write(checkpoint_file)
Example no. 12
    "buffer_size": 50000,
    "sample_batch_size": 4,
    "train_batch_size": 320,
    "schedule_max_timesteps": 2000000,
    "exploration_final_eps": 0.01,
    "exploration_fraction": 0.1,
    "model": {
        "dim": 64
    }
})


def env_creator(env_config):
    return PodWorldEnv(max_steps=100, reward_factor=1.0)


register_env("podworld_env", env_creator)
agent = DQNTrainer(config=config, env="podworld_env")
agent_save_path = None

for i in range(50):
    stats = agent.train()
    # print(pretty_print(stats))
    if i % 10 == 0 and i > 0:
        path = agent.save()
        if agent_save_path is None:
            agent_save_path = path
            print('Saved agent at', agent_save_path)
    logger.write((i, stats['episode_reward_min']))
    print('episode_reward_min', stats['episode_reward_min'])
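The logger used in this loop is created earlier in the original script. A minimal sketch of a writer exposing the write() interface used above, assuming it appends rows to a CSV file (the class and filename are illustrative):

import csv

class CSVLogger:
    def __init__(self, path):
        self._file = open(path, 'w', newline='')
        self._writer = csv.writer(self._file)

    def write(self, row):
        # row is a tuple, e.g. (iteration, episode_reward_min).
        self._writer.writerow(row)
        self._file.flush()

logger = CSVLogger('dqn_podworld_rewards.csv')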