Example #1
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        init_w = trainer.get_policy('policy_01').get_weights()
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
        )
        trainer.workers.foreach_worker(
            lambda w: w.get_policy('policy_01').set_weights(init_w))
        trainer.restore('.\\kill-policy-0\\checkpoint')
        # trainer.import_model() requires an import file path, so it is left disabled here.
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    #trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                    trainer.export_policy_model(f'./model-{iter}/main',
                                                'policy_01')
                    trainer.export_policy_model(f'./model-{iter}/collect',
                                                'policy_collect')
                    trainer.export_policy_model(f'./model-{iter}/destroy',
                                                'policy_destroy')
                    trainer.export_policy_model(f'./model-{iter}/kill',
                                                'policy_kill')

                else:
                    print("model already saved")
Example #2
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./ppo_checkpoint_201/checkpoint-201")

    return trainer  #.get_policy("trainer")
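A minimal usage sketch for the trainer returned above (the reset/step loop and Trainer.compute_action follow the pre-1.0 RLlib API; treating the custom Tron environment as a standard gym-style env is an assumption):

# Sketch only: drive one episode with the restored trainer.
trainer = load_agent()
env = TronRaySinglePlayerEnvironment(board_size=13, num_players=4,
                                     agent=SimpleAvoidAgent(noise=0.05))
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = trainer.compute_action(obs)        # the trainer applies its registered preprocessor internally
    obs, reward, done, info = env.step(action)
    total_reward += reward
print("episode reward:", total_reward)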
Example #3
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
        )
        iter = 0

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example #4
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770'
        )
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example #5
def train_model(args):
    # We are using custom model and environment, which need to be registered in ray/rllib
    # Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model are common among trainers.
    trainer = PPOTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
            "sgd_minibatch_size": 64,
            "output": None,
            "compress_observations": True,
            "num_workers": 0,
        }
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ppo_agent')
    for i in range(args.epochs):  # Number of training iterations (called epochs here)
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('PPO DuckieTown-MultiMap')
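train_model only reads args.model_path and args.epochs; a hypothetical argparse setup that would satisfy it (names chosen to match the attributes used above):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default=None,
                    help='optional checkpoint to resume training from')
parser.add_argument('--epochs', type=int, default=100,
                    help='number of trainer.train() iterations to run')
args = parser.parse_args()
train_model(args)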
Example #6
def build_bot():
    ray.init(local_mode=True)
    trainer = PPOTrainer(env=ExternalAtari, config=dict(**CONFIG_PPO))
    model_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'ckpts')
    last_iter = 0
    for name in os.listdir(model_dir):
        print(name)
        it = int(name.split('_')[1])
        if it > last_iter:
            last_iter = it
    print(
        os.listdir(
            os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         'ckpts/checkpoint_{}'.format(last_iter))))
    trainer.restore(
        os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'ckpts/checkpoint_{}/checkpoint-{}'.format(last_iter, last_iter)))
    return trainer
def get_trainer(checkpoint_path=None, extra_config=None, num_workers=10):
    config = dict(
        num_gpus=0,
        num_workers=num_workers,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,
        # explore=False,  # Add this line to only use mean for action.

        # Setup the correct environment
        env=GeneralizationRacing,
        env_config=dict(environment_num=10000))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
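get_trainer sets lr=0.0 and samples complete episodes, which suggests an evaluation setup; a hypothetical call (the checkpoint path is a placeholder):

trainer = get_trainer(
    checkpoint_path='~/ray_results/PPO/checkpoint_100/checkpoint-100',  # placeholder path
    extra_config={'explore': False},  # deterministic actions, as the commented-out line above suggests
    num_workers=2)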
Example #8
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reuse the env, another one is created later, because
    # register_env takes an env creator function rather than an env object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
Example #9
def my_train_fn(config, reporter):
    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
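my_train_fn uses Tune's function-trainable signature (config, reporter), so it is meant to be launched through tune.run; a minimal sketch with placeholder config values:

from ray import tune

config = {"lr": 0.01, "num_workers": 0}  # placeholder values
tune.run(my_train_fn, config=config)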
Example #10
def get_trainer(friction, checkpoint_path=None, extra_config=None):
    config = dict(
        num_gpus=0,
        num_workers=10,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,

        # Setup the correct environment
        env=GeneralizationRacing,
        env_config=dict(
            # The start seed defaults to 0, so the test environments are unseen during training.
            environment_num=200,
            vehicle_config=dict(wheel_friction=friction)))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
from ray.tune import function
import pickle
from collections import OrderedDict

from train_config import config_hier, config_low, single_env

if __name__ == "__main__":
    ray.shutdown()
    ray.init(ignore_reinit_error=True)

    agentLow = PPOTrainer(config_low)
    experiment_name = "HWalk_Low_Mimic"
    experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
    checkpoint_num = "1930"
    agentLow.restore(
        "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
            experiment_name, experiment_id, checkpoint_num, checkpoint_num))

    # agent.export_policy_model("out/model", "default_policy")
    # agent.import_model("out/model")

    # agent.get_policy("default_policy").import_model_from_h5

    agentHigh = PPOTrainer(config_hier)
    lowWeight = agentLow.get_policy().get_weights()
    highWeight = agentHigh.get_policy("low_level_policy").get_weights()
    importedPolicy = {
        hw: lowWeight[lw]
        for hw, lw in zip(highWeight.keys(), lowWeight.keys())
    }
    s1 = agentLow.get_policy().get_state()
Example #13
                    "episodes_this_iter": "train_episodes",
                    "policy_reward_mean/main": "reward",
                    "win_rate": "win_rate",
                    "league_size": "league_size",
                },
                sort_by_metric=True,
            ),
        )

    # Restore the trained trainer (set to non-explore behavior) and play against
    # a human on the command line.
    if args.num_episodes_human_play > 0:
        num_episodes = 0
        trainer = PPOTrainer(config=dict(config, **{"explore": False}))
        if args.from_checkpoint:
            trainer.restore(args.from_checkpoint)
        else:
            checkpoint = results.get_last_checkpoint()
            if not checkpoint:
                raise ValueError("No last checkpoint found in results!")
            trainer.restore(checkpoint)

        # Play from the command line against the trained agent
        # in an actual (non-RLlib-wrapped) open-spiel env.
        human_player = 1
        env = Environment(args.env)

        while num_episodes < args.num_episodes_human_play:
            print("You play as {}".format("o" if human_player else "x"))
            time_step = env.reset()
            while not time_step.last():
        config['rollout_fragment_length'] = rollout_fragment_length

    adjust_config(config, spec['run'])

    if args.mode == "load":
        adjust_config_for_loading(config, spec['run'])

        if spec["run"] == "PPO":
            from ray.rllib.agents.ppo import PPOTrainer as Trainer
        else:
            raise NotImplementedError("Not a supported algorithm")

        trainer = Trainer(env=env_module.env_cls, config=config)

        if args.checkpoint is not None:
            trainer.restore(args.checkpoint)

        env_module.rm.initialize()

        env = env_module.env_cls(config['env_config'])
        cam = env_module.default_cam()

        renderer = env_module.EnvRenderer(trainer=trainer, env=env, cam=cam)
        renderer.run()
    else:
        tune.run(
            spec['run'],
            name=spec['name'],
            stop=spec['stop'],
            local_dir=spec['local_dir'],
            checkpoint_freq=spec['checkpoint_freq'],
Example #15
agent_cfg["shuffle_sequences"] = True  # Whether to shuffle sequences in the batch when training
agent_cfg["grad_clip"] = None  # Clamp the norm of the gradient during optimization (None to disable)

# ====================== Run the optimization ======================

agent_cfg["lr"] = 1.0e-4
agent_cfg["lr_schedule"] = None

train_agent = Trainer(agent_cfg, "env", logger_creator)
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

test_agent = Trainer(agent_cfg, "env", logger_creator)
test_agent.restore(checkpoint_path)
test(test_agent, explore=False)

# =================== Terminate Ray backend ====================

train_agent.stop()
test_agent.stop()
ray.shutdown()

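The train and test helpers used above are defined outside this excerpt; a rough, hypothetical sketch of what train could look like, based only on how it is called here:

def train(agent, max_timesteps):
    # Hypothetical stand-in: run training iterations until the timestep budget
    # is spent, then save and return the checkpoint path expected by the caller.
    timesteps = 0
    while timesteps < max_timesteps:
        result = agent.train()
        timesteps = result["timesteps_total"]
    return agent.save()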
Example #16
            }
        },
        "multiagent": {
            "policies": env_list,
            "policy_mapping_fn": lambda agent_id: agent_id
        },
        "lr": 3e-4,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 0.0003,
        "log_level": "WARN",
        "clip_param": 10.0,
        "vf_clip_param": 10.0
    }

    trainer = PPOTrainer(env="fire_mage", config=rnn_config)
    trainer.restore('./checkpoints_iter_24/checkpoint_138/checkpoint-138')
    #trainer.restore('./checkpoints_iter_20/checkpoint_325/checkpoint-325')
    #trainer.restore('./checkpoints_iter_13/checkpoint_193/checkpoint-193')
    #trainer.restore('./checkpoints_iter_12/checkpoint_206/checkpoint-206')

    state_list = []
    for key, val in env_list.items():
        dummy_model = RNNModel(val[1],
                               val[2],
                               0,
                               rnn_config['model'],
                               'happy')
        state = dummy_model.get_initial_state()
        state_list.append((key, [s.detach().numpy() for s in state]))
    state_list = dict(state_list)
    iters = 100
    best_ckpt = 1

    ckpt_to_restore = None
    # Restore the latest checkpoint if one exists:
    for ckpt in os.listdir(ckpt_dir):
        if ckpt == ".gitkeep":
            continue
        ckpt_indx = int(ckpt.split("_")[1])
        if ckpt_indx > best_ckpt:
            best_ckpt = ckpt_indx
    if best_ckpt > 1:
        ckpt_to_restore = os.path.join(ckpt_dir,
                                       "checkpoint_" + str(best_ckpt),
                                       "checkpoint-" + str(best_ckpt))
        trainer.restore(ckpt_to_restore)
        print("Checkpoint number " + str(best_ckpt) + " restored")
    else:
        print("No checkpoint found, Training starting from scratch...")

    # Serving and training loop
    env = trainer.env_creator({})
    # obs_state = {}
    # obs_state["obs"] = obs[list(obs.keys())[0]]
    player1 = Connect4Config.PLAYER1
    player1_id = Connect4Config.PLAYER1_ID
    player2 = Connect4Config.PLAYER2
    player2_id = Connect4Config.PLAYER2_ID
    actual_player = player1
    actual_player_id = player1_id
    obs = env.reset(player1_id)
Example #18
    return env


env = env_creator()
env_name = 'pistonball_v4'
register_env(env_name, lambda config: PettingZooEnv(env_creator()))

with open(params_path, "rb") as f:
    config = pickle.load(f)
    # num_workers not needed since we are not training
    del config['num_workers']
    del config['num_gpus']

ray.init(num_cpus=8, num_gpus=1)
PPOagent = PPOTrainer(env=env_name, config=config)
PPOagent.restore(checkpoint_path)


reward_sum = 0
frame_list = []
i = 0
env.reset()

for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    reward_sum += reward
    if done:
        action = None
    else:
        action, _, _ = PPOagent.get_policy("policy_0").compute_single_action(observation)
Example #19
def run_saved(args):
    if args.OSM[0] == 1 and args.OSM[1] == 0:
        setting = "RLvsOSM"
    elif args.OSM[0] == 1 and args.OSM[1] == 1:
        setting = "OSMvsOSM"
    else:
        setting = "RL{0}".format(len(args.alphas) - sum(args.honest))
    if args.save_path == 'none':
        checkpointnum = 0
    else:
        checkpointnum = args.save_path.split('-')[-1]
    env_name = "{setting}_{spirit}_{blocks}_{alpha:04d}_{spy}_{checkpointnum}".format(
        spirit=int(args.team_spirit * 100),
        blocks=int(args.blocks),
        alpha=int(args.alphas[0] * 10000),
        spy=args.spy[1],
        setting=setting,
        checkpointnum=checkpointnum)
    ray.init(local_mode=True,
             memory=700 * 1024 * 1024,
             object_store_memory=100 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 1024)
    print("Testing {0}".format(setting), env_name)

    def select_policy(agent_id):
        return agent_id

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env(env_name, lambda config: ParametricBitcoin(config))

    if args.extended:
        action_n = 6
    else:
        action_n = 4
    # define the state space, one for parties that have access to spy info and one without
    spy_state_space = constants.make_spy_space(len(args.alphas), args.blocks)
    blind_state_space = constants.make_blind_space(len(args.alphas),
                                                   args.blocks)
    policies = dict()
    osm_space = spaces.Box(
        low=np.zeros(4),
        high=np.array([args.blocks + 4, args.blocks + 4, args.blocks + 4, 3.]))
    if sum(args.OSM) > 0:
        osm = OSM_strategy(
            osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })

    blind_dim = 0
    for space in blind_state_space:
        blind_dim += get_preprocessor(space)(space).size

    spy_dim = 0
    for space in spy_state_space:
        spy_dim += get_preprocessor(space)(space).size

    spy_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(spy_dim, ))
    })
    blind_state_space_wrapped = spaces.Dict({
        "action_mask":
        spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions":
        spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin":
        spaces.Box(0, np.inf, shape=(blind_dim, ))
    })
    preps = [None for i in range(len(args.alphas))]
    for i in range(len(args.alphas)):
        if args.spy[i] == 1:
            policies[str(i)] = (None, spy_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": True,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(spy_state_space_wrapped)(
                spy_state_space_wrapped)
        elif args.OSM[i] == 1:
            policies[str(i)] = (OSM_strategy, osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
        elif args.honest[i] == 1:
            policies[str(i)] = (Honest, osm_space, spaces.Discrete(6), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks,
                'fiftyone': args.fiftyone[i],
                'extended': args.extended
            })
        else:
            policies[str(i)] = (None, blind_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": False,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(blind_state_space_wrapped)(
                blind_state_space_wrapped)
    env_config = {
        'max_hidden_block': args.blocks,
        'alphas': args.alphas,
        'gammas': args.gammas,
        'ep_length': args.ep_length,
        'print': args.debug,
        'spy': args.spy,
        'team_spirit': args.team_spirit,
        'OSM': args.OSM,
        'extended': args.extended,
        'honest': args.honest,
    }
    policies_to_train = [
        str(i) for i in range(len(args.alphas))
        if args.OSM[i] != 1 and args.honest[i] != 1
    ]
    env = ParametricBitcoin(env_config=env_config)
    if len(policies_to_train) != 0:
        if args.trainer == 'PPO':
            trainer = PPOTrainer(env=BitcoinEnv,
                                 config={
                                     "num_workers": 0,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
        else:
            trainer = DQNTrainer(env=env_name,
                                 config={
                                     "eager": True,
                                     "multiagent": {
                                         "policies_to_train":
                                         policies_to_train,
                                         "policies": policies,
                                         "policy_mapping_fn": select_policy,
                                     },
                                     "env_config": env_config
                                 })
            model = trainer.get_policy().model
            print(model.base_model.summary())
        print("Restoring model")
        trainer.restore(args.save_path)
    loaded_policies = dict()
    for k in range(len(args.alphas)):
        if args.OSM[k] == 1:
            loaded_policies[str(k)] = osm
        elif args.honest[k] == 1:
            honest = Honest(
                osm_space,
                spaces.Discrete(6),
                {
                    'alpha': args.alphas[0],
                    'gamma': args.gammas[0],
                    'blocks': args.blocks,
                    'fiftyone': args.fiftyone[k],
                    'extended': args.extended
                },
            )
            loaded_policies[str(k)] = honest
            preps[k] = None
        else:
            loaded_policies[str(k)] = trainer.get_policy(str(k))
    trials = 100000
    reslist = []
    for j in range(3):
        blocks = np.zeros(len(args.alphas) + 1)
        event_blocks = np.zeros(len(args.alphas) + 1)
        action_dist = {
            str(i): np.zeros(action_n)
            for i in range(len(args.alphas))
        }
        res = dict()
        for i in range(trials):
            obs = env.reset()
            isDone = False
            RNNstates = {str(i): [] for i in range(len(args.alphas))}
            while not isDone:
                action_dict = dict()
                for k in range(len(policies)):
                    prep = preps[k]
                    if not prep:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=obs[str(k)],
                                                      state=[])
                    else:
                        action_dict[str(k)], _, _ = loaded_policies[str(
                            k)].compute_single_action(obs=prep.transform(
                                obs[str(k)]),
                                                      state=[])
                    action_dist[str(k)][action_dict[str(k)]] += 1
                obs, _, done, _ = env.step(action_dict)
                isDone = done['__all__']
            if i == 0 and j == 0:
                with open(
                        os.path.join('/afs/ece/usr/charlieh/eval_results',
                                     env_name + '_trace.txt'), 'w+') as f:
                    f.write(env.wrapped._debug_string)
            blocks += env.wrapped._accepted_blocks
            event_blocks += env.wrapped._total_blocks
            total_event_blocks = np.sum(event_blocks)
            if i % 100 == 0:
                print("Relative rewards", blocks / np.sum(blocks))
                print("Relative received", event_blocks / total_event_blocks)
                for p in range(len(args.alphas)):  # p, not i, to avoid shadowing the trial counter
                    print("Action dist", str(p),
                          action_dist[str(p)] / np.sum(action_dist[str(p)]))
        res['blocks'] = blocks
        res['action dist'] = action_dist
        res['blocks norm'] = blocks / np.sum(blocks)
        res['actions norm'] = {
            str(i): action_dist[str(i)] / np.sum(action_dist[str(i)])
            for i in range(len(args.alphas))
        }
        reslist.append(res)
    np.save(os.path.join('/afs/ece/usr/charlieh/eval_results', env_name),
            reslist,
            allow_pickle=True)
Example #20
                "model": {
                    "custom_model": "3rd_model",
                    "use_lstm": True,
                }
            }),
        },
        "policy_mapping_fn": (lambda agent_id: "ppo_policy"),
        "policies_to_train": ["ppo_policy"],
    },
},
                       env=v0.RllibPomme)

# fdb733b6
checkpoint = 950
checkpoint_dir = "/home/nhatminh2947/ray_results/3rd_model_no_wood_static/PPO_PommeMultiAgent_283d4406_0_2020-03-24_04-09-09mjgzr90e"
ppo_agent.restore("{}/checkpoint_{}/checkpoint-{}".format(
    checkpoint_dir, checkpoint, checkpoint))

agents_list = [
    agents.StaticAgent(),
    agents.StaticAgent(),
    agents.StaticAgent(),
    agents.StaticAgent()
]
env_id = "PommeTeam-nowood-v0"
env = pommerman.make(env_id, agents_list)

penv = v0.RllibPomme({
    "agent_names": agent_names,
    "env_id": env_id,
    "phase": 0
})
Example #21
#config['seed'] = SEED

# We will use a simple convolution network with 3 layers as our feature extractor
config['model']['vf_share_layers'] = True
config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
config['model']['fcnet_hiddens'] = [256]
config['model']['custom_preprocessor'] = 'tron_prep'

# Begin training or evaluation
trainer = PPOTrainer(config, "tron_single_player")
num_epoch = 10000
test_epoch = 2

if LOAD_FROM_CHECKPOINT:
#    np.random.seed(42)
    trainer.restore("./ppo_model/checkpoint_400/checkpoint-400")
    for epoch in range(num_epoch):
        print("Training iteration: {}".format(epoch), end='')
        res = trainer.train()
        print(f", Average reward: {res['episode_reward_mean']}")

        if epoch % test_epoch == 0:
            reward = env.test(trainer)
        if epoch % 300 == 0:
            trainer.save()
    trainer.save()

else:
    for epoch in range(num_epoch):
        #print(type(trainer))
        print("Training iteration: {}".format(epoch), end='')
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer":
            (None, env.observation_space, env.action_space, agent_config),
            "opponent":
            (None, env.observation_space, env.action_space, agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")

    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")

    return trainer.get_policy("trainer")
    NUM_GPUS = args.num_gpus
    TOTAL_STEPS = int(args.total_steps)
    launch_script = "./launchClient_quiet.sh"

    register_env(ENV_NAME, create_env)

    # update config with evaluation resources and switch exploration off
    config = get_config(checkpoint_file)
    config["num_workers"] = args.num_workers
    config["num_gpus"] = args.num_gpus
    config["explore"] = False

    # Load agent
    ray.init()
    trainer = PPOTrainer(config)
    trainer.restore(checkpoint_file)
    policy = trainer.get_policy()

    # Start Malmo instances
    GAME_INSTANCE_PORTS = [COMMAND_PORT + i for i in range(NUM_WORKERS)]
    instances = launch_minecraft(GAME_INSTANCE_PORTS,
                                 launch_script=launch_script)

    # Connect to the Java instances
    env = create_env(config)

    # Custom evaluation loop
    print(f"running evaluations for {EPISODES} episodes")
    for ep in range(EPISODES):
        state = env.reset()
        done = False
Example #24
ppo_config['num_workers'] = 4  # noptepochs (int) Number of epochs when optimizing the surrogate
ppo_config['clip_param'] = 0.2  # cliprange (float or callable) Clipping parameter, it can be a function
ppo_config['vf_clip_param'] = 1  # cliprange_vf = None? -- (float or callable) Clipping parameter for the value function,
# it can be a function. This is a parameter specific to the OpenAI implementation. If None is passed (default), then
# cliprange (that is used for the policy) will be used. IMPORTANT: this clipping depends on the reward scaling. To
# deactivate value function clipping (and recover the original PPO implementation), you have to pass a negative value
# (e.g. -1).
ppo_config['env_config'] = env_config
ppo_config['train_batch_size'] = 4000
ppo_config['explore'] = False

PPO_agent = PPOTrainer(config=ppo_config, env=SSA_Tasker_Env)
PPO_agent.restore(ppo_checkpoint)
PPO_agent.get_policy().config['explore'] = False

logdir = '/home/ash/ray_results/ssa_experiences/agent_visible_greedy_spoiled/' + str(
    env_config['rso_count']) + 'RSOs_jones_flatten_10000episodes/'

marwil_config = MARWIL_CONFIG.copy()
marwil_config['evaluation_num_workers'] = 1
marwil_config['env_config'] = env_config
marwil_config['evaluation_interval'] = 1
marwil_config['evaluation_config'] = {'input': 'sampler'}
marwil_config['beta'] = 1  # 0
marwil_config['input'] = logdir
marwil_config['env_config'] = env_config
marwil_config['explore'] = False
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping,
        "policies_to_train": policies_to_train,
    },
    "observation_filter": "NoFilter",
    "clip_actions": False,
    "framework": "torch"
},
                       env="MinerEnv-v0")

id = 2050
checkpoint_dir = "/home/lucius/ray_results/gold_miner_2/PPO_MinerEnv-v0_0_2020-09-13_00-54-26q3mjnpej"
checkpoint = "{}/checkpoint_{}/checkpoint-{}".format(checkpoint_dir, id, id)

ppo_agent.restore(checkpoint)

for i in range(8):
    mem_size = 0
    weights = ppo_agent.get_policy(f"policy_{i}").get_weights()
    for key in weights:
        parameters = 1
        for value in weights[key].shape:
            parameters *= value

        mem_size += parameters

        weights[key] = torch.tensor(weights[key])
    print(mem_size)
    torch.save(
        weights,
Example #26

config['multiagent'] = {
    "policy_mapping_fn": policy_mapping_function,
    "policies": {
        "trainer":
        (None, env.observation_space, env.action_space, agent_config),
        "opponent":
        (None, env.observation_space, env.action_space, agent_config)
    },
    "policies_to_train": ["trainer"]
}

trainer = PPOTrainer(config, "tron_multi_player")
#trainer.restore("./desktop_version/checkpoint_1802/checkpoint-1802")
trainer.restore("./ppo_selfplay/sp_checkpoint_2257/checkpoint-2257")

num_epoch = 1000
save_epochs = 50
update_times = 0
#update_percentage = update_times * 0.01
epoch_update = 0

for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='\t')
    res = trainer.train()
    win_percentage = (res["policy_reward_mean"]["trainer"] -
                      res["episode_len_mean"]) / 11 - 10 / 11 + 1
    print("Win percentage: ", win_percentage, end='\t')
    print("Average reward: ", res["policy_reward_mean"]["trainer"])
    update_percentage = update_times * 0.01
Example #27
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }
    results = tune.run(
        args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)

    checkpoints = results.get_trial_checkpoints_paths(
        trial=results.get_best_trial("episode_reward_mean", mode="max"),
        metric="episode_reward_mean")

    checkpoint_path = checkpoints[0][0]
    trainer = PPOTrainer(config)
    trainer.restore(checkpoint_path)

    # Inference loop.
    env = StatelessCartPole()

    # Run manual inference loop for n episodes.
    for _ in range(10):
        episode_reward = 0.0
        reward = 0.0
        action = 0
        done = False
        obs = env.reset()
        while not done:
            # Create a dummy action using the same observation n times,
            # as well as dummy prev-n-actions and prev-n-rewards.
            action, state, logits = trainer.compute_single_action(
Example #28
        config = {**env_config, **agent_config, **general_config}
        agent = PPOTrainer(config=config)
    elif args.run == "SAC":
        agent_config = config_SAC
        config = {**env_config, **agent_config, **general_config}
        agent = SACTrainer(config=config)
    elif args.run == "DDPG":
        agent_config = config_DDPG
        config = {**env_config, **agent_config, **general_config}
        agent = DDPGTrainer(config=config)

    #  '/home/david/ray_results/SAC/SAC_FarmEnv_ff600_00000_0_2021-02-06_14-34-11/checkpoint_50/checkpoint-50'

    checkpoint_path = '/home/david/ray_results/SAC/SAC_FarmEnv_305d2_00000_0_2021-03-24_08-40-22/checkpoint_10/checkpoint-10'

    agent.restore(checkpoint_path=checkpoint_path)

    font = pygame.font.Font('freesansbold.ttf', 20)
    textX = 10
    textY = 10
    # arrow indicating wind direction
    arrow_Img = pygame.image.load('wind-compass.png')
    arrow_x = 250
    arrow_y = 5

    screen = pygame.display.set_mode((800, 600))

    pygame.display.set_caption('WindAI')

    def update_env():
Example #29
config['num_workers'] = 1
config['num_gpus'] = 1
config['framework'] = "torch"
config['gamma'] = 0.1

config['monitor'] = False

# PPO config ...
# config['lr'] = 1e-4
# config['train_batch_size']
config['model']['dim'] = 21
config['model']['conv_filters'] = [[8, [4, 4], 2], [16, [2, 2], 2],
                                   [512, [6, 6], 1]]  #,
#[config['train_batch_size'], 4, 1, 1]]

# trainer = PPOTrainer(config=config, env="mars_explorer:explorer-v01")
trainer = PPOTrainer(config=config, env="custom-explorer")
# import pdb; pdb.set_trace()

PATH = "/home/dkoutras/ray_results/290_out_of_400/checkpoint_2991/checkpoint-2991"
trainer.restore(PATH)
import pdb
pdb.set_trace()

for _ in range(10):
    initial_time = time.time()
    result = trainer.train()
    print(
        f"mean:{result['episode_reward_mean']} time:{time.time() - initial_time:.2f}[sec]"
    )
        verbose=1,
        checkpoint_freq=1,
        checkpoint_at_end=True,
    )
    print("Pre-training done.")

    best_checkpoint = results.get_best_checkpoint(
        results.trials[0], mode="max")
    print(f".. best checkpoint was: {best_checkpoint}")

    # Create a new dummy Trainer to "fix" our checkpoint.
    new_trainer = PPOTrainer(config=config)
    # Get untrained weights for all policies.
    untrained_weights = new_trainer.get_weights()
    # Restore all policies from checkpoint.
    new_trainer.restore(best_checkpoint)
    # Set back all weights (except for 1st agent) to original
    # untrained weights.
    new_trainer.set_weights(
        {pid: w
         for pid, w in untrained_weights.items() if pid != "policy_0"})
    # Create the checkpoint from which tune can pick up the
    # experiment.
    new_checkpoint = new_trainer.save()
    new_trainer.stop()
    print(".. checkpoint to restore from (all policies reset, "
          f"except policy_0): {new_checkpoint}")

    print("Starting new tune.run")

    # Start our actual experiment.